Example #1
def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    cluster_dict_arrays = []
    # Flatten the cluster dict into lists of the form [head word, member word, ...]
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        # Group together terms whose cutoff values lie within cluster_boundary of each other
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters.
        # The selection step is left unimplemented in this excerpt (the loop below is a stub
        # and cutoff_words is never populated); Example #3 (selectCutOffByExplanation)
        # performs the equivalent selection via webapi.getHighestScore.
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    pass

    dt.write2dArray(cutoff_words,
                    "../data/movies/rules/cutoff/" + file_name + "WVN.txt")
Example #2
def fixCutoffFormatting(cutoff_fn, file_name):
    cutoff = dt.import1dArray(cutoff_fn)
    # Each line of the cutoff file is a whitespace-separated list of punctuated integers
    for c in range(len(cutoff)):
        cutoff[c] = cutoff[c].split()
        for i in range(len(cutoff[c])):
            cutoff[c][i] = int(dt.stripPunctuation(cutoff[c][i]))
    dt.write2dArray(cutoff,
                    "../data/movies/rules/cutoff/" + file_name + ".txt")
Example #3
import copy

def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # For each cluster
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(
                clusters[cl])
            explained_cutoff.append(cluster_explanation + ",")

            dict_index = 0
            for h in range(len(cluster_dict_arrays[cl])):
                if cluster_dict_arrays[cl][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[cl][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)
    dt.write2dArray(
        explanations,
        "../data/movies/rules/final_names/" + file_name + "WVN.txt")
    dt.write2dArray(explanation_cutoffs,
                    "../data/movies/rules/final_cutoff/" + file_name + ".txt")
Example #4
def binaryInCluster(cluster_dict_fn, fn):
    cluster = dt.readArrayDict(cluster_dict_fn)
    all_cluster_output = []
    for key, items in cluster.items():
        init_binary = dt.import1dArray(
            "../data/movies/bow/binary/phrases/" + key, "i")
        for i in items:
            binary = dt.import1dArray("../data/movies/bow/binary/phrases/" + i,
                                      "i")
            # OR the member phrase's binary vector into the cluster's running vector
            for j in range(len(init_binary)):
                if binary[j] == 1:
                    init_binary[j] = 1
        all_cluster_output.append(init_binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "InCluster.txt")
Example #5
from sklearn.metrics import cohen_kappa_score

def getCutOff(cluster_dict_fn, rankings_fn, file_name):
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)

    # Ranking values are read as strings with a trailing delimiter; strip it and convert to int
    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])

    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            max_score = 0
            cutoff = 0
            # The ground-truth class vector depends only on v, so load it once per term
            y_test = dt.import2dArray(
                "../data/movies/bow/frequency/phrases/class-" + v, "s")
            # Scan candidate rank cutoffs 1..100 and keep the one with the best kappa
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                score = cohen_kappa_score(y_test, y_pred)
                print(v, i, "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter += 1
    dt.write2dArray(cutoff_clusters,
                    "../data/movies/rules/cutoff/" + file_name + ".txt")