Example #1
    def __init__(self, directions_fn, vectors_fn, cluster_names_fn,
                 vector_names_fn, fn, percent, percentage_increment,
                 by_vector):

        directions = dt.importVectors(directions_fn)
        vectors = dt.importVectors(vectors_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(vector_names_fn)

        rankings = self.getRankings(directions, vectors, cluster_names,
                                    vector_names)
        rankings = np.array(rankings)
        #labels = self.createLabels(rankings, percent)
        #labels = np.asarray(labels)
        discrete_labels = self.createDiscreteLabels(rankings,
                                                    percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
        if by_vector:
            #labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
            rankings = rankings.transpose()
        #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) +".labels")
        dt.write2dArray(rankings, "Rankings/" + fn + ".space")
        dt.write2dArray(
            discrete_labels,
            "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
        array = []
        short_array = []
        """ Disabled names for quick view now
Example #2
    def __init__(self, discrete_labels_fn, ppmi_fn, phrases_fn,
                 phrases_to_check_fn, fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)

        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        counter = 0
        with open(discrete_labels_fn) as f:
            for line in f:
                exists = True
                if phrases_to_check_fn != "":
                    exists = False
                    for i in indexes_to_get:
                        if i == counter:
                            exists = True
                            break
                if exists:
                    discrete_labels = line.split()
                    saveGraph(discrete_labels, ppmi[counter],
                              fn + " " + phrases[counter][6:])
                    print phrases[counter]
                counter += 1
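
saveGraph is not defined in this snippet. A minimal sketch of what such a helper could look like, assuming matplotlib; only the signature is taken from the call above, everything else is guesswork:

import matplotlib
matplotlib.use("Agg")  # render to files, no display needed
import matplotlib.pyplot as plt

def saveGraph(discrete_labels, ppmi_row, title):
    # Hypothetical stand-in: plot one phrase's PPMI values and note how
    # many discrete ranking bins the phrase was assigned.
    plt.figure()
    plt.plot(ppmi_row, ".", markersize=3)
    plt.title(title + " (" + str(len(set(discrete_labels))) + " label bins)")
    plt.xlabel("entity index")
    plt.ylabel("PPMI")
    plt.savefig(title.replace(" ", "_") + ".png")
    plt.close()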
Example #3
    def __init__(self, discrete_labels_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)

        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        counter = 0
        with open(discrete_labels_fn) as f:
            for line in f:
                exists = True
                if phrases_to_check_fn != "":
                    exists = False
                    for i in indexes_to_get:
                        if i == counter:
                            exists = True
                            break
                if exists:
                    discrete_labels = line.split()
                    saveGraph(discrete_labels, ppmi[counter], fn + " " + phrases[counter][6:])
                    print phrases[counter]
                counter += 1
Example #4
    def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn,
                 label_names_fn, cluster_names_fn, filename, training_data,
                 cluster_to_classify, max_depth):

        vectors = dt.importVectors(cluster_vectors_fn)
        labels = dt.importLabels(cluster_labels_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(movie_names_fn)
        label_names = dt.importString(label_names_fn)
        scores_array = []
        for l in range(len(labels[0])):
            new_labels = [0] * 15000  # hardcoded number of movies in this dataset
            for x in range(len(labels)):
                new_labels[x] = labels[x][l]
            x_train = np.asarray(vectors[:training_data])
            x_test = np.asarray(vectors[training_data:])
            y_train = np.asarray(new_labels[:training_data])
            y_test = np.asarray(new_labels[training_data:])

            self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
            self.clf = self.clf.fit(x_train, y_train)

            y_pred = self.clf.predict(x_test)
            f1 = f1_score(y_test, y_pred, average='binary')
            accuracy = accuracy_score(y_test, y_pred)
            scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
            print scores[0]
            scores_array.append(scores)

            class_names = [label_names[l], "NOT " + label_names[l]]
            tree.export_graphviz(self.clf,
                                 feature_names=cluster_names,
                                 class_names=class_names,
                                 out_file='Rules/' + label_names[l] +
                                 filename + '.dot',
                                 max_depth=10)
            """
            rewrite_dot_file = dt.importString('Rules/'+filename+label_names[l]+'.dot')
            new_dot_file = []
            for s in rewrite_dot_file:
                new_string = s
                if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s:
                    index = s.index("value")
                    new_string = s[:index] + '"] ;'
                new_dot_file.append(new_string)
            dt.write1dArray(new_dot_file, 'Rules/Cleaned'+filename+label_names[l]+'.dot')
            """
            graph = pydot.graph_from_dot_file('Rules/' + label_names[l] +
                                              filename + '.dot')
            graph.write_png('Rules/Images/' + label_names[l] + filename +
                            ".png")
            self.get_code(self.clf, cluster_names, class_names,
                          label_names[l] + filename)
        dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
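
The per-label training loop above is standard scikit-learn. A self-contained sketch of the same fit/predict/score cycle on toy data (no dt helpers; all names here are invented stand-ins):

import numpy as np
from sklearn import tree
from sklearn.metrics import f1_score, accuracy_score

rng = np.random.RandomState(0)
vectors = rng.rand(100, 5)                   # stand-in for the cluster vectors
labels = (vectors[:, 0] > 0.5).astype(int)   # stand-in for one binary label column

x_train, x_test = vectors[:80], vectors[80:]
y_train, y_test = labels[:80], labels[80:]

clf = tree.DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("f1=%.3f accuracy=%.3f" % (f1_score(y_test, y_pred, average='binary'),
                                 accuracy_score(y_test, y_pred)))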
Example #5
def makeKeywordPPMIVectors(file_name, common_keywords):
    print "?"
    file = open(file_name, "r")
    lines = file.readlines()
    last_film = ""
    movie_strings = dt.importString("filmdata/filmNames.txt")
    standard_strings = []
    for m in movie_strings:
        m = m[:-5]
        standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper())
    for line in lines:
        film_vectors = []
        line = line.strip()
        if len(line) > 2:
            line_split = re.split(r'\t+', line)
            line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper()
            file_save = ""
            for m in range(len(standard_strings)):
                if standard_strings[m] == line_split[0]:
                    file_save = str(m)
                    break
            if file_save != "":
                file = open("filmdata\KeywordData\Movie_Most_Common_Keyword_Mapping\\" + file_save, "a")
                for keyword in common_keywords:
                    if line_split[2] == keyword.strip():
                        film_vectors.append("line")
                        file.write(keyword)
                        break
                if last_film.strip() != line_split[0].strip() and last_film != "":
                    print "Succeeded", line_split[0]
                    file.close()

                last_film = line_split[0]
            else:
                print "Failed", line_split[0]
Example #6
def findMissingKeywords(file_name, common_keywords):
    print "?"
    file = open(file_name, "r")
    lines = file.readlines()
    last_film = ""
    movie_strings = dt.importString("filmdata/filmNames.txt")
    standard_strings = []
    indexes = []
    for m in movie_strings:
        m = m[:-5]
        standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper())
    for line in lines:
        film_vectors = []
        line = line.strip()
        if len(line) > 2:
            line_split = re.split(r'\t+', line)
            line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper()
            file_save = ""
            for m in range(len(standard_strings)):
                if standard_strings[m] == line_split[0]:
                    print "matched", m, standard_strings[m], line_split[0]
                    file_save = str(m)
                    break
            if file_save != "":
                if last_film.strip() != line_split[0].strip() and last_film != "":
                    print "Succeeded", line_split[0]
                    for m in range(len(standard_strings)):
                        if standard_strings[m] == last_film:
                            indexes.append(m)
                            break
                last_film = line_split[0]
            else:
                print "Failed", line_split[0],
    dt.write1dArray(indexes, "filmdata/MISSING_FROM_MOVIEDATA.txt")
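
Both functions above normalize titles with Python 2's str.translate(None, string.punctuation). A sketch of the same cleanup for Python 3, where translate takes a mapping table instead:

import string

_PUNCT = str.maketrans("", "", string.punctuation)

def standardize_title(name):
    # Python 3 equivalent of:
    # name.translate(None, string.punctuation).replace(" ", "").strip().upper()
    return name.translate(_PUNCT).replace(" ", "").strip().upper()

print(standardize_title("The Thing (1982)"))  # THETHING1982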
Example #7
def outputKeywords():
    movie_strings = dt.importString("filmdata/filmNames.txt")
    movie_data = getMovieDataFromIMDB(movie_strings)
    commonality = 0
    common_keywords = getMostCommonKeywords(commonality, "filmdata/IMDB_movie_data.txt")
    dt.write1dArray(common_keywords, "filmdata/common_keywords_15k_commonality_" + str(commonality))
    vectors = getKeywordVectors(common_keywords, movie_strings, "")
    dt.write2dArray(vectors, "filmdata/classesKeywords/class-extra-all-commonality-" + str(commonality))
Example #8
    def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn, filename, training_data, cluster_to_classify, max_depth):

        vectors = dt.importVectors(cluster_vectors_fn)
        labels = dt.importLabels(cluster_labels_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(movie_names_fn)
        label_names = dt.importString(label_names_fn)
        scores_array = []
        for l in range(len(labels[0])):
            new_labels = [0] * 15000  # hardcoded number of movies in this dataset
            for x in range(len(labels)):
                new_labels[x] = labels[x][l]
            x_train = np.asarray(vectors[:training_data])
            x_test = np.asarray(vectors[training_data:])
            y_train = np.asarray(new_labels[:training_data])
            y_test = np.asarray(new_labels[training_data:])


            self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
            self.clf = self.clf.fit(x_train, y_train)

            y_pred = self.clf.predict(x_test)
            f1 = f1_score(y_test, y_pred, average='binary')
            accuracy = accuracy_score(y_test, y_pred)
            scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
            print scores[0]
            scores_array.append(scores)

            class_names = [label_names[l], "NOT " + label_names[l]]
            tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names, out_file='Rules/'+label_names[l]+filename+'.dot', max_depth=10)
            """
            rewrite_dot_file = dt.importString('Rules/'+filename+label_names[l]+'.dot')
            new_dot_file = []
            for s in rewrite_dot_file:
                new_string = s
                if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s:
                    index = s.index("value")
                    new_string = s[:index] + '"] ;'
                new_dot_file.append(new_string)
            dt.write1dArray(new_dot_file, 'Rules/Cleaned'+filename+label_names[l]+'.dot')
            """
            graph = pydot.graph_from_dot_file('Rules/'+label_names[l]+filename+'.dot')
            graph.write_png('Rules/Images/'+label_names[l]+filename+".png")
            self.get_code(self.clf, cluster_names, class_names, label_names[l]+filename)
        dt.write1dArray(scores_array, 'Rules/Scores/'+filename+'.scores')
Example #9
def getKNearestMovies(data, x, k):
    movie_names = dt.importString("filmdata/filmNames.txt")
    kd_tree = spatial.KDTree(data)
    kd_query = kd_tree.query(x=x, k=k)
    nearest_distances = kd_query[0][1:]
    k_nearest = kd_query[1][1:]
    nearest_movies = []
    for index in k_nearest:
        nearest_movies.append(movie_names[index].strip())
    print nearest_movies
    return nearest_movies, nearest_distances
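
The query drops index 0 because querying a point that is already in the tree returns that point itself at distance zero. A quick check on toy data:

import numpy as np
from scipy import spatial

data = np.array([[0., 0.], [1., 0.], [0., 1.], [5., 5.]])
kd_tree = spatial.KDTree(data)
distances, indexes = kd_tree.query(x=data[0], k=3)
print("indexes %s distances %s" % (indexes[1:], distances[1:]))
# neighbours at indexes 1 and 2, both at distance 1.0; the query point is skipped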
Example #10
    def splitDirections(self, directions_fn, scores_fn, names_fn,
                        low_threshold, high_threshold):
        directions = dt.importVectors(directions_fn)
        scores = dt.importString(scores_fn)
        names = dt.importString(names_fn)

        for s in range(len(scores)):
            scores[s] = float(scores[s].strip())

        high_direction_indexes = []
        high_direction_scores = []
        low_direction_indexes = []
        low_direction_scores = []

        for s in range(len(scores)):
            if scores[s] >= high_threshold:
                high_direction_indexes.append(s)
                high_direction_scores.append(scores[s])
            elif scores[s] >= low_threshold:
                low_direction_indexes.append(s)
                low_direction_scores.append(scores[s])

        sorted_h_indexes = dt.sortByArray(high_direction_indexes,
                                          high_direction_scores)
        sorted_l_indexes = dt.sortByArray(low_direction_indexes,
                                          low_direction_scores)
        sorted_h_indexes.reverse()
        sorted_l_indexes.reverse()
        high_direction_names = []
        low_direction_names = []
        high_directions = []
        low_directions = []
        for s in sorted_h_indexes:
            high_directions.append(directions[s])
            high_direction_names.append(names[s][6:])
        for s in sorted_l_indexes:
            low_directions.append(directions[s])
            low_direction_names.append(names[s][6:])

        return high_direction_names, low_direction_names, high_directions, low_directions
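
On toy inputs the partition works like this: scores at or above high_threshold go to the high group, scores in [low_threshold, high_threshold) go to the low group, everything else is dropped, and both groups come back sorted by descending score. A plain-Python sketch of that rule:

scores = [0.9, 0.2, 0.6, 0.75]
low_threshold, high_threshold = 0.5, 0.7
high = sorted([s for s in scores if s >= high_threshold], reverse=True)
low = sorted([s for s in scores if low_threshold <= s < high_threshold], reverse=True)
print("%s %s" % (high, low))  # [0.9, 0.75] [0.6]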
Example #11
    def __init__(self, directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, fn, percent, percentage_increment, by_vector):

        directions = dt.importVectors(directions_fn)
        vectors = dt.importVectors(vectors_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(vector_names_fn)

        rankings = self.getRankings(directions, vectors, cluster_names, vector_names)
        rankings = np.array(rankings)
        #labels = self.createLabels(rankings, percent)
        #labels = np.asarray(labels)
        discrete_labels = self.createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
        if by_vector:
            #labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
            rankings = rankings.transpose()
        #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) +".labels")
        dt.write2dArray(rankings, "Rankings/" + fn + ".space")
        dt.write2dArray(discrete_labels, "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
        array = []
        short_array = []
        """ Disabled names for quick view now
Example #12
    def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
        directions = dt.importVectors(directions_fn)
        scores = dt.importString(scores_fn)
        names = dt.importString(names_fn)

        for s in range(len(scores)):
            scores[s] = float(scores[s].strip())

        high_direction_indexes = []
        high_direction_scores = []
        low_direction_indexes = []
        low_direction_scores = []

        for s in range(len(scores)):
            if scores[s] >= high_threshold:
                high_direction_indexes.append(s)
                high_direction_scores.append(scores[s])
            elif scores[s] >= low_threshold:
                low_direction_indexes.append(s)
                low_direction_scores.append(scores[s])

        sorted_h_indexes = dt.sortByArray(high_direction_indexes, high_direction_scores)
        sorted_l_indexes = dt.sortByArray(low_direction_indexes, low_direction_scores)
        sorted_h_indexes.reverse()
        sorted_l_indexes.reverse()
        high_direction_names = []
        low_direction_names = []
        high_directions = []
        low_directions = []
        for s in sorted_h_indexes:
            high_directions.append(directions[s])
            high_direction_names.append(names[s][6:])
        for s in sorted_l_indexes:
            low_directions.append(directions[s])
            low_direction_names.append(names[s][6:])

        return high_direction_names, low_direction_names, high_directions, low_directions
Example #13
    def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)
        indexes_to_get.sort()
        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        scores = []
        pvalues = []
        scores_kendall = []
        pvalues_kendall = []
        agini = []
        amap = []
        andcg = []
        counter = 0
        averages = []
        with open(direction_fn) as f:
            for line in f:
                exists = True
                if phrases_to_check_fn != "":
                    exists = False
                    for i in indexes_to_get:
                        if i == counter:
                            exists = True
                            break
                if exists:
                    total = 0
                    amt = 0
                    direction = line.split()
                    for d in range(len(direction)):
                        direction[d] = float(direction[d])
                    new_direction = []
                    new_ppmi = []
                    direction_rank = np.argsort(direction)
                    ppmi_rank = np.argsort(ppmi[counter])
                    for d in range(len(ppmi[counter])):
                        if ppmi[counter][d] != 0:
                            total += ppmi[counter][d]
                            amt += 1
                            new_direction.append(direction_rank[d])
                            new_ppmi.append(ppmi_rank[d])
                    average = total / amt

                    min_max_scaler = preprocessing.MinMaxScaler()
                    normalized_ppmi = min_max_scaler.fit_transform(ppmi[counter])
                    normalized_dir = min_max_scaler.fit_transform(direction)

                    ginis = gini(normalized_ppmi, normalized_dir)

                    ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
                    nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
                    ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))

                    #binarizer = preprocessing.Binarizer()
                    #binary_ppmi = binarizer.transform(normalized_ppmi)
                    #normalized_dir = np.ndarray.tolist(normalized_dir)
                    map = 0  # average_precision_score(normalized_ppmi, normalized_dir)

                    rho, pvalue = spearmanr(new_ppmi, new_direction)
                    rhok, pvaluek = kendalltau(new_ppmi, new_direction)

                    scores.append(rho)
                    pvalues.append(pvalue)
                    scores_kendall.append(rhok)
                    pvalues_kendall.append(pvaluek)
                    andcg.append(ndcgs)
                    agini.append(ginis)
                    amap.append(map)
                    averages.append(average)
                    print phrases[counter] + ":", map, ginis

                counter += 1
        dt.write1dArray(scores, "RuleType/s" + fn + ".score")
        dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
        dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
        dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
        dt.write1dArray(phrases, "RuleType/" + fn + ".names")
        dt.write1dArray(averages, "RuleType/" + fn + ".averages")
        dt.write1dArray(agini, "RuleType/gn" + fn + ".score")
        dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score")
        dt.write1dArray(amap, "RuleType/map" + fn + ".score")
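
The rank-correlation core of the loop is plain scipy: spearmanr and kendalltau each return a (statistic, p-value) pair. A minimal check on two toy rankings:

from scipy.stats import spearmanr, kendalltau

new_ppmi = [3, 1, 4, 2, 5]
new_direction = [2, 1, 5, 3, 4]
rho, pvalue = spearmanr(new_ppmi, new_direction)
rhok, pvaluek = kendalltau(new_ppmi, new_direction)
print("spearman %.2f kendall %.2f" % (rho, rhok))  # 0.80 and 0.60: the rankings largely agree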
Example #14
    def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn,
                 fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)
        indexes_to_get.sort()
        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        scores = []
        pvalues = []
        scores_kendall = []
        pvalues_kendall = []
        agini = []
        amap = []
        andcg = []
        counter = 0
        averages = []
        with open(direction_fn) as f:
            for line in f:
                exists = True
                if phrases_to_check_fn != "":
                    exists = False
                    for i in indexes_to_get:
                        if i == counter:
                            exists = True
                            break
                if exists:
                    total = 0
                    amt = 0
                    direction = line.split()
                    for d in range(len(direction)):
                        direction[d] = float(direction[d])
                    new_direction = []
                    new_ppmi = []
                    direction_rank = np.argsort(direction)
                    ppmi_rank = np.argsort(ppmi[counter])
                    for d in range(len(ppmi[counter])):
                        if ppmi[counter][d] != 0:
                            total += ppmi[counter][d]
                            amt += 1
                            new_direction.append(direction_rank[d])
                            new_ppmi.append(ppmi_rank[d])
                    average = total / amt

                    min_max_scaler = preprocessing.MinMaxScaler()
                    normalized_ppmi = min_max_scaler.fit_transform(
                        ppmi[counter])
                    normalized_dir = min_max_scaler.fit_transform(direction)

                    ginis = gini(normalized_ppmi, normalized_dir)

                    ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
                    nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
                    ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))

                    #binarizer = preprocessing.Binarizer()
                    #binary_ppmi = binarizer.transform(normalized_ppmi)
                    #normalized_dir = np.ndarray.tolist(normalized_dir)
                    map = 0  #average_precision_score(normalized_ppmi, normalized_dir)

                    rho, pvalue = spearmanr(new_ppmi, new_direction)
                    rhok, pvaluek = kendalltau(new_ppmi, new_direction)

                    scores.append(rho)
                    pvalues.append(pvalue)
                    scores_kendall.append(rhok)
                    pvalues_kendall.append(pvaluek)
                    andcg.append(ndcgs)
                    agini.append(ginis)
                    amap.append(map)
                    averages.append(average)
                    print phrases[counter] + ":", map, ginis

                counter += 1
        dt.write1dArray(scores, "RuleType/s" + fn + ".score")
        dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
        dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
        dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
        dt.write1dArray(phrases, "RuleType/" + fn + ".names")
        dt.write1dArray(averages, "RuleType/" + fn + ".averages")
        dt.write1dArray(agini, "RuleType/gn" + fn + ".score")
        dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score")
        dt.write1dArray(amap, "RuleType/map" + fn + ".score")
Example #15
    def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn,
                 fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)

        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        scores = []
        pvalues = []
        scores_kendall = []
        pvalues_kendall = []
        counter = 0
        averages = []
        with open(direction_fn) as f:
            for line in f:
                if indexes_to_get:
                    for i in indexes_to_get:
                        if i == counter:
                            total = 0
                            amt = 0
                            direction = line.split()
                            for d in range(len(direction)):
                                direction[d] = float(direction[d])
                            new_direction = []
                            new_ppmi = []
                            direction_rank = np.argsort(direction)
                            ppmi_rank = np.argsort(ppmi[counter])
                            for d in range(len(ppmi[counter])):
                                if ppmi[counter][d] != 0:
                                    total += ppmi[counter][d]
                                    amt += 1
                                    new_direction.append(direction_rank[d])
                                    new_ppmi.append(ppmi_rank[d])
                            average = total / amt
                            rho, pvalue = spearmanr(new_ppmi, new_direction)
                            rhok, pvaluek = kendalltau(new_ppmi, new_direction)
                            scores.append(rho)
                            pvalues.append(pvalue)
                            scores_kendall.append(rhok)
                            pvalues_kendall.append(pvaluek)
                            averages.append(average)
                            print phrases[counter] + ":", rho, pvalue, average
                else:
                    direction = line.split()
                    for d in range(len(direction)):
                        direction[d] = float(direction[d])
                    direction_rank = np.argsort(direction)
                    ppmi_rank = np.argsort(ppmi[counter])
                    rho, pvalue = spearmanr(direction_rank, ppmi_rank)
                    scores.append(rho)
                    pvalues.append(pvalue)
                    print phrases[counter] + ":", rho, pvalue
                counter += 1
        dt.write1dArray(scores, "RuleType/s" + fn + ".score")
        dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
        dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
        dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
        dt.write1dArray(phrases, "RuleType/" + fn + ".names")
        dt.write1dArray(averages, "RuleType/" + fn + ".averages")
Example #16
    def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)

        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        scores = []
        pvalues = []
        scores_kendall = []
        pvalues_kendall = []
        counter = 0
        averages = []
        with open(direction_fn) as f:
            for line in f:
                if indexes_to_get:
                    for i in indexes_to_get:
                        if i == counter:
                            total = 0
                            amt = 0
                            direction = line.split()
                            for d in range(len(direction)):
                                direction[d] = float(direction[d])
                            new_direction = []
                            new_ppmi = []
                            direction_rank = np.argsort(direction)
                            ppmi_rank = np.argsort(ppmi[counter])
                            for d in range(len(ppmi[counter])):
                                if ppmi[counter][d] != 0:
                                    total += ppmi[counter][d]
                                    amt += 1
                                    new_direction.append(direction_rank[d])
                                    new_ppmi.append(ppmi_rank[d])
                            average = total / amt
                            rho, pvalue = spearmanr(new_ppmi, new_direction)
                            rhok, pvaluek = kendalltau(new_ppmi, new_direction)
                            scores.append(rho)
                            pvalues.append(pvalue)
                            scores_kendall.append(rhok)
                            pvalues_kendall.append(pvaluek)
                            averages.append(average)
                            print phrases[counter] + ":", rho, pvalue, average
                else:
                    direction = line.split()
                    for d in range(len(direction)):
                        direction[d] = float(direction[d])
                    direction_rank = np.argsort(direction)
                    ppmi_rank = np.argsort(ppmi[counter])
                    rho, pvalue = spearmanr(direction_rank, ppmi_rank)
                    scores.append(rho)
                    pvalues.append(pvalue)
                    print phrases[counter] + ":", rho, pvalue
                counter += 1
        dt.write1dArray(scores, "RuleType/s" + fn + ".score")
        dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
        dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
        dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
        dt.write1dArray(phrases, "RuleType/" + fn + ".names")
        dt.write1dArray(averages, "RuleType/" + fn + ".averages")
Example #17
def outputPhrases():
    IDs = dt.importString("filmdata/filmIDs.txt")
    unique_phrases = dt.importString("filmdata/uniquePhrases.txt")
    vectors_maintained, vectors = getVectors(IDs, unique_phrases)
    dt.write2dArray(vectors, "filmdata/classesPhrases/class-all")
    dt.write2dArray(vectors_maintained, "filmdata/classesPhrases/nonbinary/class-all")