# Shared imports assumed by the examples below; `dt` (data helpers), `st`
# (similarity helpers), `meanShift` and `kMeans` are project-local modules
# whose import paths are not shown in the source.
import numpy as np
from collections import OrderedDict
from scipy import spatial


def saveClusters(directions_fn,
                 scores_fn,
                 names_fn,
                 filename,
                 amt_of_dirs,
                 data_type,
                 cluster_amt,
                 rewrite_files=False,
                 algorithm="meanshift_k"):

    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [dict_fn, cluster_directions_fn]  # skip only if every output file exists
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    p_dir = np.asarray(dt.import2dArray(directions_fn))
    p_names = np.asarray(dt.import1dArray(names_fn, "s"))
    p_scores = np.asarray(dt.import1dArray(scores_fn, "f"))

    # Sort by score and keep the amt_of_dirs highest-scoring directions
    ids = np.argsort(p_scores)
    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]

    # Note: the default algorithm name "meanshift_k" does not match
    # "meanshift", so kMeans runs unless "meanshift" is passed explicitly
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    unique, counts = np.unique(labels, return_counts=True)

    # Group names and directions by cluster label, then average each cluster
    clusters = [[] for _ in unique]
    dir_clusters = [[] for _ in unique]
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    cluster_directions = [dt.mean_of_array(dc) for dc in dir_clusters]

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")

    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
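
# A minimal usage sketch for saveClusters (hypothetical file names and
# parameters; the helper modules assumed above must be importable):
#
#     saveClusters("../data/movies/directions.txt",
#                  "../data/movies/scores.txt",
#                  "../data/movies/names.txt",
#                  filename="movies_top200_k20",
#                  amt_of_dirs=200,
#                  data_type="movies",
#                  cluster_amt=20,
#                  algorithm="kmeans")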

# Example 2

def nameClustersMeanDirection(cluster_directions):
    # Load the 50-dimensional GloVe word vectors (trained on Wikipedia + Gigaword)
    file = open("../data/wikipedia/word_vectors/glove.6B.50d.txt",
                encoding="utf8")
    lines = file.readlines()
    word_vectors = []
    word_vector_names = []
    # Parse each line into a word and its float vector
    for l in lines:
        parts = l.split()
        word_vector_names.append(parts[0])
        word_vectors.append([float(x) for x in parts[1:]])
    words = []
    # Mark singular/plural duplicates of the cluster centre (key) or of other
    # terms for deletion
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if key == value[v] or key == value[v][:-1] or key[:-1] == value[v]:
                print("deleted", value[v], key)
                value[v] = "DELETE"
            for val in reversed(value):
                # Skip terms already marked so one member of each pair survives
                if val == "DELETE" or val is value[v]:
                    continue
                if val == value[v][:-1] or val[:-1] == value[v]:
                    print("deleted", value[v], val)
                    value[v] = "DELETE"
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if value[v] == "DELETE":
                del value[v]

    for key, value in cluster_directions.items():
        # Find the word vector for the cluster centre (key), if one exists
        cluster_word_vectors = []
        for w in range(len(word_vector_names)):
            if word_vector_names[w].strip() == key.strip():
                cluster_word_vectors.append(word_vectors[w])
                print("Success", key)
                break
            if w == len(word_vector_names) - 1:
                print("Failed", key)

        # Collect word vectors for at most the first 10 cluster terms
        for v in range(min(len(value), 10)):
            for w in range(len(word_vector_names)):
                if word_vector_names[w].strip() == value[v].strip():
                    cluster_word_vectors.append(word_vectors[w])
                    print("Success", value[v])
                    break
                if w == len(word_vector_names) - 1:
                    print("Failed", value[v])
        if len(cluster_word_vectors) > 0:
            # Name the cluster after the word closest to the mean of its vectors
            mean_vector = dt.mean_of_array(cluster_word_vectors)
            print(mean_vector)
            print(cluster_word_vectors[0])
            h_sim = 0
            closest_word = ""
            for v in range(len(word_vectors)):
                sim = st.getSimilarity(word_vectors[v], mean_vector)
                if sim > h_sim:
                    print("New highest sim", word_vector_names[v])
                    h_sim = sim
                    closest_word = word_vector_names[v]
            print("Closest Word", closest_word)
            words.append(closest_word)
        else:
            words.append(key)
    return words
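
# The naming step above reduces to: average the cluster's word vectors, then
# return the vocabulary word closest to that mean. A self-contained numpy
# sketch of that core (cosine similarity standing in for st.getSimilarity):
#
#     import numpy as np
#
#     def closest_word(mean_vector, word_vectors, word_names):
#         wv = np.asarray(word_vectors, dtype=float)
#         m = np.asarray(mean_vector, dtype=float)
#         sims = wv @ m / (np.linalg.norm(wv, axis=1) * np.linalg.norm(m))
#         return word_names[int(np.argmax(sims))]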

# Example 3

def createTermClusters(hv_directions, lv_directions, hv_names, lv_names,
                       amt_of_clusters, dont_cluster):
    least_similar_clusters = []
    least_similar_cluster_ids = []
    least_similar_cluster_names = []

    print("Overall amount of HV directions: ", len(hv_directions))
    # Create high-valued clusters
    least_similar_cluster_ids.append(0)
    least_similar_clusters.append(hv_directions[0])
    least_similar_cluster_names.append(hv_names[0])
    print("Least Similar Term", hv_names[0])

    hv_to_delete = [0]
    # Greedily pick the direction least similar to those already chosen,
    # until amt_of_clusters cluster centres have been selected
    for i in range(min(len(hv_directions), amt_of_clusters - 1)):
        ti = st.getNextClusterTerm(least_similar_clusters, hv_directions,
                                   least_similar_cluster_ids, 1)
        least_similar_cluster_ids.append(ti)
        least_similar_clusters.append(hv_directions[ti])
        least_similar_cluster_names.append(hv_names[ti])
        hv_to_delete.append(ti)
        print(
            str(i + 1) + "/" + str(amt_of_clusters), "Least Similar Term",
            hv_names[ti])

    # Add the remaining high-value directions to the low-value direction list
    if dont_cluster == 0:
        hv_directions = np.asarray(hv_directions)
        hv_names = np.asarray(hv_names)

        hv_directions = np.delete(hv_directions, hv_to_delete, 0)
        hv_names = np.delete(hv_names, hv_to_delete, 0)

        for i in range(len(hv_directions)):
            lv_directions.insert(0, hv_directions[i])
            lv_names.insert(0, hv_names[i])

        # Initialize dictionaries for printing / visualizing
        cluster_name_dict = OrderedDict()
        for c in least_similar_cluster_names:
            cluster_name_dict[c] = []

        # For every low-value direction, find the high-value direction it is
        # most similar to and append it to that cluster's directions
        every_cluster_direction = []
        for i in least_similar_clusters:
            every_cluster_direction.append([i])

        # Finding the most similar directions to each cluster_centre
        # Creating a dictionary of {cluster_centre: [cluster_direction(1), ..., cluster_direction(n)]} pairs
        for d in range(len(lv_directions)):
            i = st.getXMostSimilarIndex(lv_directions[d],
                                        least_similar_clusters, [], 1)[0]
            every_cluster_direction[i].append(lv_directions[d])
            print(
                str(d + 1) + "/" + str(len(lv_directions)), "Most Similar to",
                lv_names[d], "Is", least_similar_cluster_names[i])
            cluster_name_dict[least_similar_cluster_names[i]].append(
                lv_names[d])

        # Mean of all directions = cluster direction
        cluster_directions = []
        for l in range(len(least_similar_clusters)):
            cluster_directions.append(
                dt.mean_of_array(every_cluster_direction[l]))
    else:
        cluster_name_dict = OrderedDict()
        for c in least_similar_cluster_names:
            cluster_name_dict[c] = []

        # No clustering of the remaining terms: each centre stands alone and
        # is used directly as its cluster's direction
        every_cluster_direction = []
        for i in least_similar_clusters:
            every_cluster_direction.append([i])
        cluster_directions = least_similar_clusters

    return cluster_directions, least_similar_cluster_names, cluster_name_dict, least_similar_clusters
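
# A hedged usage sketch (shapes and names are illustrative): with
# dont_cluster=0 the remaining directions are folded into the chosen centres,
# while any other value keeps the centres as singleton clusters:
#
#     dirs, names, name_dict, centres = createTermClusters(
#         hv_directions, lv_directions, hv_names, lv_names,
#         amt_of_clusters=20, dont_cluster=0)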

# Example 4

def nameClustersRemoveOutliers(cluster_directions):
    # Load the 50-dimensional GloVe word vectors (trained on Wikipedia + Gigaword)
    file = open("../data/wikipedia/word_vectors/glove.6B.50d.txt",
                encoding="utf8")
    lines = file.readlines()
    wv = []
    wvn = []
    # Parse each line into a word (wvn) and its float vector (wv)
    for l in lines:
        parts = l.split()
        wvn.append(parts[0])
        wv.append([float(x) for x in parts[1:]])
    words = []
    # Mark singular/plural duplicates of the cluster centre (key) or of other
    # terms for deletion
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if key == value[v] or key == value[v][:-1] or key[:-1] == value[v]:
                print("deleted", value[v], key)
                value[v] = "DELETE"
            for val in reversed(value):
                # Skip terms already marked so one member of each pair survives
                if val == "DELETE" or val is value[v]:
                    continue
                if val == value[v][:-1] or val[:-1] == value[v]:
                    print("deleted", value[v], val)
                    value[v] = "DELETE"
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if value[v] == "DELETE":
                del value[v]

    # For every cluster (key: cluster center, value: similar terms)
    for key, value in cluster_directions.items():
        # If the center/values in the vector have a corresponding word vector, add the vectors to an array
        cluster_word_vectors = []
        cluster_word_vector_names = []
        for w in range(len(wvn)):
            if wvn[w].strip() == key.strip():
                cluster_word_vectors.append(wv[w])
                cluster_word_vector_names.append(wvn[w])
                print("Success", key)
                break
            if w == len(wvn) - 1:
                print("Failed", key)
        # Collect word vectors for at most the first 10 cluster terms
        for v in range(min(len(value), 10)):
            for w in range(len(wvn)):
                if wvn[w].strip() == value[v].strip():
                    cluster_word_vectors.append(wv[w])
                    cluster_word_vector_names.append(wvn[w])
                    print("Success", value[v])
                    break
                if w == len(wvn) - 1:
                    print("Failed", value[v])

        # If we found at least two word vectors
        if len(cluster_word_vectors) > 1:

            # Find the medoid: the vector with the smallest total cosine
            # distance to every other vector in the cluster
            min_ang_dist = float("inf")
            min_index = None
            for i in range(len(cluster_word_vectors)):
                total_dist = 0
                for j in range(len(cluster_word_vectors)):
                    total_dist += spatial.distance.cosine(
                        cluster_word_vectors[i], cluster_word_vectors[j])
                if total_dist < min_ang_dist:
                    min_ang_dist = total_dist
                    min_index = i
                    print("New min word:",
                          cluster_word_vector_names[min_index])

            medoid_wv = []
            medoid_wvn = []
            # Drop outliers: vectors whose cosine distance from the medoid
            # exceeds the threshold
            threshold = 0.8
            for i in range(len(cluster_word_vectors)):
                dist = spatial.distance.cosine(cluster_word_vectors[min_index],
                                               cluster_word_vectors[i])
                if dist < threshold:
                    medoid_wv.append(cluster_word_vectors[i])
                    medoid_wvn.append(cluster_word_vector_names[i])
                else:
                    print("Deleted outlier", cluster_word_vector_names[i])
            if len(medoid_wv) > 1:
                # Get the mean direction of non-outlier directions
                mean_vector = dt.mean_of_array(medoid_wv)
                # Find the most similar vector to that mean
                h_sim = 0
                closest_word = ""
                for v in range(len(wv)):
                    sim = st.getSimilarity(wv[v], mean_vector)
                    if sim > h_sim:
                        print("New highest sim", wvn[v])
                        h_sim = sim
                        closest_word = wvn[v]
                print("Closest Word", closest_word)
                words.append(closest_word)
            else:
                words.append(medoid_wvn[0])
        else:
            words.append(key)
    return words
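
# The medoid search above is quadratic in the cluster size; the same result
# can be computed with scipy's pairwise distances. A minimal sketch
# (hypothetical helper, not part of the repository):
#
#     import numpy as np
#     from scipy import spatial
#
#     def medoid_index(vectors):
#         # Pairwise cosine distances; the medoid minimises its row sum
#         d = spatial.distance.squareform(
#             spatial.distance.pdist(np.asarray(vectors), metric="cosine"))
#         return int(np.argmin(d.sum(axis=1)))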

# Example 5

def nameClustersRemoveOutliersWeight(cluster_directions, weights_fn, is_gini):
    # Load term weights, phrase names, and the precomputed word vectors
    weights = dt.import1dArray(weights_fn)
    weights = [float(w) for w in weights]
    phrases = dt.import1dArray("../data/movies/bow/names/200.txt")
    wv, wvn = dt.getWordVectors()
    words = []
    # Mark singular/plural duplicates of the cluster centre (key) or of other
    # terms for deletion
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if key == value[v] or key == value[v][:-1] or key[:-1] == value[v]:
                print("deleted", value[v], key)
                value[v] = "DELETE"
            for val in reversed(value):
                # Skip terms already marked so one member of each pair survives
                if val == "DELETE" or val is value[v]:
                    continue
                if val == value[v][:-1] or val[:-1] == value[v]:
                    print("deleted", value[v], val)
                    value[v] = "DELETE"
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if value[v] == "DELETE":
                del value[v]

    # For every cluster (key: cluster center, value: similar terms)
    for key, value in cluster_directions.items():
        # If the center/values in the vector have a corresponding word vector, add the vectors to an array
        cluster_word_vectors = []
        cluster_word_vector_names = []
        for w in range(len(wvn)):
            if wvn[w].strip() == key.strip():
                cluster_word_vectors.append(wv[w])
                cluster_word_vector_names.append(wvn[w])
                print("Success", key)
                break
            if w == len(wvn) - 1:
                print("Failed", key)
        # Collect word vectors for at most the first 10 cluster terms
        for v in range(min(len(value), 10)):
            for w in range(len(wvn)):
                if wvn[w].strip() == value[v].strip():
                    cluster_word_vectors.append(wv[w])
                    cluster_word_vector_names.append(wvn[w])
                    print("Success", value[v])
                    break
                if w == len(wvn) - 1:
                    print("Failed", value[v])

        # If we found at least two word vectors
        if len(cluster_word_vectors) > 1:

            # Find the medoid: the vector with the smallest total cosine
            # distance to every other vector in the cluster
            min_ang_dist = float("inf")
            min_index = None
            for i in range(len(cluster_word_vectors)):
                total_dist = 0
                for j in range(len(cluster_word_vectors)):
                    total_dist += spatial.distance.cosine(
                        cluster_word_vectors[i], cluster_word_vectors[j])
                if total_dist < min_ang_dist:
                    min_ang_dist = total_dist
                    min_index = i
                    print("New min word:",
                          cluster_word_vector_names[min_index])

            medoid_wv = []
            medoid_wvn = []
            # Drop outliers: vectors whose cosine distance from the medoid
            # exceeds the threshold
            threshold = 0.8
            for i in range(len(cluster_word_vectors)):
                dist = spatial.distance.cosine(cluster_word_vectors[min_index],
                                               cluster_word_vectors[i])
                if dist < threshold:
                    medoid_wv.append(cluster_word_vectors[i])
                    medoid_wvn.append(cluster_word_vector_names[i])
                else:
                    print("Deleted outlier", cluster_word_vector_names[i])
            if len(medoid_wv) > 1:
                # Look up the bag-of-words weight for each surviving medoid
                # word; phrase names carry a 6-character prefix that is
                # stripped before matching. A neutral weight of 1.0 is used
                # when a word has no matching phrase, so weights stay aligned
                # with medoid_wv (the original indexing assumed every word
                # matched exactly one phrase).
                a_weights = []
                for wvna in medoid_wvn:
                    weight = 1.0
                    for w in range(len(phrases)):
                        if phrases[w][6:] == wvna:
                            weight = weights[w]
                            break
                    a_weights.append(weight)
                if is_gini:
                    # Gini is an impurity score (lower is better), so invert it
                    for s in range(len(a_weights)):
                        a_weights[s] = 1.0 - a_weights[s]
                # Scale each medoid vector by its weight
                for m in range(len(medoid_wv)):
                    for a in range(len(medoid_wv[m])):
                        medoid_wv[m][a] = medoid_wv[m][a] * a_weights[m]
                # Get the mean direction of non-outlier directions
                mean_vector = dt.mean_of_array(medoid_wv)
                # Find the most similar vector to that mean
                h_sim = 0
                closest_word = ""
                for v in range(len(wv)):
                    sim = st.getSimilarity(wv[v], mean_vector)
                    if sim > h_sim:
                        print("New highest sim", wvn[v])
                        h_sim = sim
                        closest_word = wvn[v]
                print("Closest Word", closest_word)
                words.append(closest_word)
            else:
                words.append(medoid_wvn[0])
        else:
            words.append(key)
    return words
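
# The weighting step scales each medoid vector by its (possibly gini-inverted)
# term weight before averaging. A compact numpy equivalent (hypothetical
# helper, not part of the repository):
#
#     import numpy as np
#
#     def weighted_mean(vectors, term_weights, is_gini=False):
#         w = np.asarray(term_weights, dtype=float)
#         if is_gini:
#             w = 1.0 - w  # gini impurity: lower is better, so invert
#         return (np.asarray(vectors, dtype=float) * w[:, None]).mean(axis=0)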