def __init__(self, directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, fn, percent, percentage_increment, by_vector):
    # Rank every vector against every direction and persist the rankings plus
    # discretized labels under "Rankings/<fn>...".
    # All *_fn params are input file names; percent / percentage_increment
    # control label binning; by_vector transposes output so rows are vectors.
    directions = dt.importVectors(directions_fn)
    vectors = dt.importVectors(vectors_fn)
    cluster_names = dt.importString(cluster_names_fn)
    vector_names = dt.importString(vector_names_fn)
    rankings = self.getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.array(rankings)
    # Plain (non-discrete) labels are currently disabled.
    #labels = self.createLabels(rankings, percent)
    #labels = np.asarray(labels)
    discrete_labels = self.createDiscreteLabels(rankings, percentage_increment)
    discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        #labels = labels.transpose()
        discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()
    #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) +".labels")
    dt.write2dArray(rankings, "Rankings/" + fn + ".space")
    dt.write2dArray( discrete_labels, "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
    array = []
    short_array = []
    # NOTE(review): the triple-quote below opens a disabled section that
    # continues beyond this view; it is left exactly as found.
    """ Disabled names for quick view now
def __init__(self, discrete_labels_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn): ppmi = dt.importLabels(ppmi_fn) ppmi = np.asarray(ppmi) phrases = dt.importString(phrases_fn) indexes_to_get = [] if phrases_to_check_fn != "": phrases_to_check = dt.importString(phrases_to_check_fn) for pc in range(len(phrases_to_check)): for p in range(len(phrases)): if phrases_to_check[pc] == phrases[p][6:]: indexes_to_get.append(p) ppmi = ppmi.transpose() print len(ppmi), len(ppmi[0]) counter = 0 with open(discrete_labels_fn) as f: for line in f: exists = True if phrases_to_check_fn != "": exists = False for i in indexes_to_get: if i == counter: exists = True break if exists: discrete_labels = line.split() saveGraph(discrete_labels, ppmi[counter], fn + " " + phrases[counter][6:]) print phrases[counter] counter += 1
def __init__(self, discrete_labels_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn): ppmi = dt.importLabels(ppmi_fn) ppmi = np.asarray(ppmi) phrases = dt.importString(phrases_fn) indexes_to_get = [] if phrases_to_check_fn != "": phrases_to_check = dt.importString(phrases_to_check_fn) for pc in range(len(phrases_to_check)): for p in range(len(phrases)): if phrases_to_check[pc] == phrases[p][6:]: indexes_to_get.append(p) ppmi = ppmi.transpose() print len(ppmi), len(ppmi[0]) counter = 0 with open(discrete_labels_fn) as f: for line in f: exists = True if phrases_to_check_fn != "": exists = False for i in indexes_to_get: if i == counter: exists = True break if exists: discrete_labels = line.split() saveGraph(discrete_labels, ppmi[counter], fn + " " + phrases[counter][6:]) print phrases[counter] counter += 1
def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn, filename, training_data, cluster_to_classify, max_depth): vectors = dt.importVectors(cluster_vectors_fn) labels = dt.importLabels(cluster_labels_fn) cluster_names = dt.importString(cluster_names_fn) vector_names = dt.importString(movie_names_fn) label_names = dt.importString(label_names_fn) scores_array = [] for l in range(len(labels[0])): new_labels = [0] * 15000 for x in range(len(labels)): new_labels[x] = labels[x][l] x_train = np.asarray(vectors[:training_data]) x_test = np.asarray(vectors[training_data:]) y_train = np.asarray(new_labels[:training_data]) y_test = np.asarray(new_labels[training_data:]) self.clf = tree.DecisionTreeClassifier(max_depth=max_depth) self.clf = self.clf.fit(x_train, y_train) y_pred = self.clf.predict(x_test) f1 = f1_score(y_test, y_pred, average='binary') accuracy = accuracy_score(y_test, y_pred) scores = [[label_names[l], "f1", f1, "accuracy", accuracy]] print scores[0] scores_array.append(scores) class_names = [label_names[l], "NOT " + label_names[l]] tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names, out_file='Rules/' + label_names[l] + filename + '.dot', max_depth=10) """ rewrite_dot_file = dt.importString('Rules/'+filename+label_names[l]+'.dot') new_dot_file = [] for s in rewrite_dot_file: new_string = s if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s: index = s.index("value") new_string = s[:index] + '"] ;' new_dot_file.append(new_string) dt.write1dArray(new_dot_file, 'Rules/Cleaned'+filename+label_names[l]+'.dot') """ graph = pydot.graph_from_dot_file('Rules/' + label_names[l] + filename + '.dot') graph.write_png('Rules/Images/' + label_names[l] + filename + ".png") self.get_code(self.clf, cluster_names, class_names, label_names[l] + filename) dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
def makeKeywordPPMIVectors(file_name, common_keywords): print "?" file = open(file_name, "r") lines = file.readlines() last_film = "" movie_strings = dt.importString("filmdata/filmNames.txt") standard_strings = [] for m in movie_strings: m = m[:-5] standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper()) for line in lines: film_vectors = [] line = line.strip() if len(line) > 2: line_split = re.split(r'\t+', line) line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper() file_save = "" for m in range(len(standard_strings)): if standard_strings[m] == line_split[0]: file_save = str(m) break if file_save != "": file = open("filmdata\KeywordData\Movie_Most_Common_Keyword_Mapping\\" + file_save, "a") for keyword in common_keywords: if line_split[2] == keyword.strip(): film_vectors.append("line") file.write(keyword) break if last_film.strip() != line_split[0].strip() and last_film is not None: print "Succeeded", line_split[0] file.close() last_film = line_split[0] else: print "Failed", line_split[0]
def findMissingKeywords(file_name, common_keywords): print "?" file = open(file_name, "r") lines = file.readlines() last_film = "" movie_strings = dt.importString("filmdata/filmNames.txt") standard_strings = [] indexes = [] for m in movie_strings: m = m[:-5] standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper()) for line in lines: film_vectors = [] line = line.strip() if len(line) > 2: line_split = re.split(r'\t+', line) line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper() file_save = "" for m in range(len(standard_strings)): if standard_strings[m] == line_split[0]: print "matched", m, standard_strings[m], line_split[0] file_save = str(m) break if file_save != "": if last_film.strip() != line_split[0].strip() and last_film is not None: print "Succeeded", line_split[0] for m in range(len(standard_strings)): if standard_strings[m] == last_film: indexes.append(m) break last_film = line_split[0] else: print "Failed", line_split[0], dt.write1dArray(indexes, "filmdata/MISSING_FROM_MOVIEDATA.txt")
def outputKeywords():
    """Build and persist the common-keyword class vectors for all films.

    Fix vs original: the commonality threshold variable is now actually
    passed to getMostCommonKeywords instead of a hard-coded literal 0, so
    changing `commonality` takes effect everywhere (identical behaviour at
    the current value of 0).
    """
    movie_strings = dt.importString("filmdata/filmNames.txt")
    # Retained for its side effects on the IMDB data file — TODO confirm.
    movie_data = getMovieDataFromIMDB(movie_strings)
    commonality = 0
    common_keywords = getMostCommonKeywords(commonality, "filmdata/IMDB_movie_data.txt")
    dt.write1dArray(common_keywords, "filmdata/common_keywords_15k_commanility_" + str(commonality))
    vectors = getKeywordVectors(common_keywords, movie_strings, "")
    dt.write2dArray(vectors, "filmdata/classesKeywords/class-extra-all-commonality-" + str(commonality))
def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn, filename, training_data, cluster_to_classify, max_depth): vectors = dt.importVectors(cluster_vectors_fn) labels = dt.importLabels(cluster_labels_fn) cluster_names = dt.importString(cluster_names_fn) vector_names = dt.importString(movie_names_fn) label_names = dt.importString(label_names_fn) scores_array = [] for l in range(len(labels[0])): new_labels = [0] * 15000 for x in range(len(labels)): new_labels[x] = labels[x][l] x_train = np.asarray(vectors[:training_data]) x_test = np.asarray(vectors[training_data:]) y_train = np.asarray(new_labels[:training_data]) y_test = np.asarray(new_labels[training_data:]) self.clf = tree.DecisionTreeClassifier( max_depth=max_depth) self.clf = self.clf.fit(x_train, y_train) y_pred = self.clf.predict(x_test) f1 = f1_score(y_test, y_pred, average='binary') accuracy = accuracy_score(y_test, y_pred) scores = [[label_names[l], "f1", f1, "accuracy", accuracy]] print scores[0] scores_array.append(scores) class_names = [ label_names[l], "NOT "+label_names[l]] tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names, out_file='Rules/'+label_names[l]+filename+'.dot', max_depth=10) """ rewrite_dot_file = dt.importString('Rules/'+filename+label_names[l]+'.dot') new_dot_file = [] for s in rewrite_dot_file: new_string = s if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s: index = s.index("value") new_string = s[:index] + '"] ;' new_dot_file.append(new_string) dt.write1dArray(new_dot_file, 'Rules/Cleaned'+filename+label_names[l]+'.dot') """ graph = pydot.graph_from_dot_file('Rules/'+label_names[l]+filename+'.dot') graph.write_png('Rules/Images/'+label_names[l]+filename+".png") self.get_code(self.clf, cluster_names, class_names, label_names[l]+filename) dt.write1dArray(scores_array, 'Rules/Scores/'+filename+'.scores')
def getKNearestMovies(data, x, k): movie_names = dt.importString("filmdata/filmNames.txt") kd_tree = spatial.KDTree(data) kd_query = kd_tree.query(x=x, k=k) nearest_distances = kd_query[0][1:] k_nearest = kd_query[1][1:] nearest_movies = [] for k in k_nearest: nearest_movies.append(movie_names[k].strip()) print nearest_movies return nearest_movies, nearest_distances
def getKNearestMovies(data, x, k): movie_names = dt.importString("filmdata/filmNames.txt") kd_tree = spatial.KDTree(data) kd_query = kd_tree.query(x=x, k=k) nearest_distances = kd_query[0][1:] k_nearest = kd_query[1][1:] nearest_movies = [] for k in k_nearest: nearest_movies.append(movie_names[k].strip()) print nearest_movies return nearest_movies, nearest_distances
def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
    """Partition directions into a high band (score >= high_threshold)
    and a low band (high_threshold > score >= low_threshold).

    Each band is returned sorted by descending score as
    (high_names, low_names, high_directions, low_directions); names have
    their first six characters stripped.
    """
    directions = dt.importVectors(directions_fn)
    names = dt.importString(names_fn)
    scores = [float(s.strip()) for s in dt.importString(scores_fn)]
    high_direction_indexes = []
    high_direction_scores = []
    low_direction_indexes = []
    low_direction_scores = []
    for index, score in enumerate(scores):
        if score >= high_threshold:
            high_direction_indexes.append(index)
            high_direction_scores.append(score)
        elif score >= low_threshold:
            low_direction_indexes.append(index)
            low_direction_scores.append(score)
    sorted_h_indexes = dt.sortByArray(high_direction_indexes, high_direction_scores)
    sorted_l_indexes = dt.sortByArray(low_direction_indexes, low_direction_scores)
    sorted_h_indexes.reverse()
    sorted_l_indexes.reverse()
    high_directions = [directions[i] for i in sorted_h_indexes]
    high_direction_names = [names[i][6:] for i in sorted_h_indexes]
    low_directions = [directions[i] for i in sorted_l_indexes]
    low_direction_names = [names[i][6:] for i in sorted_l_indexes]
    return high_direction_names, low_direction_names, high_directions, low_directions
def __init__(self, directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, fn, percent, percentage_increment, by_vector):
    # Rank every vector against every direction and persist the rankings plus
    # discretized labels under "Rankings/<fn>...". (Duplicate of an earlier
    # definition in this file.)
    directions = dt.importVectors(directions_fn)
    vectors = dt.importVectors(vectors_fn)
    cluster_names = dt.importString(cluster_names_fn)
    vector_names = dt.importString(vector_names_fn)
    rankings = self.getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.array(rankings)
    # Plain (non-discrete) labels are currently disabled.
    #labels = self.createLabels(rankings, percent)
    #labels = np.asarray(labels)
    discrete_labels = self.createDiscreteLabels(rankings, percentage_increment)
    discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        #labels = labels.transpose()
        discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()
    #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) +".labels")
    dt.write2dArray(rankings, "Rankings/" + fn + ".space")
    dt.write2dArray(discrete_labels, "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
    array = []
    short_array = []
    # NOTE(review): the triple-quote below opens a disabled section that
    # continues beyond this view; it is left exactly as found.
    """ Disabled names for quick view now
def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
    """Split directions into two score bands and return them, sorted by
    descending score, as (high_names, low_names, high_dirs, low_dirs).
    Names are stripped of their 6-character prefix."""
    directions = dt.importVectors(directions_fn)
    scores = dt.importString(scores_fn)
    names = dt.importString(names_fn)
    for s in range(len(scores)):
        scores[s] = float(scores[s].strip())
    high = {"indexes": [], "scores": []}
    low = {"indexes": [], "scores": []}
    for s in range(len(scores)):
        if scores[s] >= high_threshold:
            bucket = high
        elif scores[s] >= low_threshold:
            bucket = low
        else:
            continue
        bucket["indexes"].append(s)
        bucket["scores"].append(scores[s])
    sorted_h_indexes = dt.sortByArray(high["indexes"], high["scores"])
    sorted_l_indexes = dt.sortByArray(low["indexes"], low["scores"])
    sorted_h_indexes.reverse()
    sorted_l_indexes.reverse()
    high_direction_names = []
    low_direction_names = []
    high_directions = []
    low_directions = []
    for s in sorted_h_indexes:
        high_directions.append(directions[s])
        high_direction_names.append(names[s][6:])
    for s in sorted_l_indexes:
        low_directions.append(directions[s])
        low_direction_names.append(names[s][6:])
    return high_direction_names, low_direction_names, high_directions, low_directions
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn): ppmi = dt.importLabels(ppmi_fn) ppmi = np.asarray(ppmi) phrases = dt.importString(phrases_fn) indexes_to_get = [] if phrases_to_check_fn != "": phrases_to_check = dt.importString(phrases_to_check_fn) for pc in range(len(phrases_to_check)): for p in range(len(phrases)): if phrases_to_check[pc] == phrases[p][6:]: indexes_to_get.append(p) indexes_to_get.sort() ppmi = ppmi.transpose() print len(ppmi), len(ppmi[0]) scores = [] pvalues = [] scores_kendall = [] pvalues_kendall = [] agini = [] agini1 = [] angini1 = [] angini = [] amap = [] andcg = [] counter = 0 averages = [] with open(direction_fn) as f: for line in f: exists = True if phrases_to_check_fn != "": exists = False for i in indexes_to_get: if i == counter: exists = True break if exists: total = 0 amt = 0 direction = line.split() for d in range(len(direction)): direction[d] = float(direction[d]) new_direction = [] new_ppmi = [] direction_rank = np.argsort(direction) ppmi_rank = np.argsort(ppmi[counter]) for d in range(len(ppmi[counter])): if ppmi[counter][d] != 0: total += ppmi[counter][d] amt += 1 new_direction.append(direction_rank[d]) new_ppmi.append(ppmi_rank[d]) average = total / amt min_max_scaler = preprocessing.MinMaxScaler() normalized_ppmi = min_max_scaler.fit_transform(ppmi[counter]) normalized_dir = min_max_scaler.fit_transform(direction) ginis = gini(normalized_ppmi, normalized_dir) ranked_ppmi = dt.sortByArray(new_ppmi, new_direction) nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi) ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi)) #binarizer = preprocessing.Binarizer() #binary_ppmi = binarizer.transform(normalized_ppmi) #normalized_dir = np.ndarray.tolist(normalized_dir) map = 0#average_precision_score(normalized_ppmi, normalized_dir) rho, pvalue = spearmanr(new_ppmi, new_direction) rhok, pvaluek = kendalltau(new_ppmi, new_direction) scores.append(rho) pvalues.append(pvalue) scores_kendall.append(rhok) 
pvalues_kendall.append(pvaluek) andcg.append(ndcgs) agini.append(ginis) amap.append(map) averages.append(average) print phrases[counter] + ":", map, ginis counter += 1 dt.write1dArray(scores, "RuleType/s" + fn + ".score") dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue") dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score") dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue") dt.write1dArray(phrases, "RuleType/" + fn + ".names") dt.write1dArray(averages, "RuleType/" + fn + ".averages") dt.write1dArray(agini, "RuleType/gn" + fn + ".score") dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score") dt.write1dArray(amap, "RuleType/map" + fn + ".score")
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn): ppmi = dt.importLabels(ppmi_fn) ppmi = np.asarray(ppmi) phrases = dt.importString(phrases_fn) indexes_to_get = [] if phrases_to_check_fn != "": phrases_to_check = dt.importString(phrases_to_check_fn) for pc in range(len(phrases_to_check)): for p in range(len(phrases)): if phrases_to_check[pc] == phrases[p][6:]: indexes_to_get.append(p) indexes_to_get.sort() ppmi = ppmi.transpose() print len(ppmi), len(ppmi[0]) scores = [] pvalues = [] scores_kendall = [] pvalues_kendall = [] agini = [] agini1 = [] angini1 = [] angini = [] amap = [] andcg = [] counter = 0 averages = [] with open(direction_fn) as f: for line in f: exists = True if phrases_to_check_fn != "": exists = False for i in indexes_to_get: if i == counter: exists = True break if exists: total = 0 amt = 0 direction = line.split() for d in range(len(direction)): direction[d] = float(direction[d]) new_direction = [] new_ppmi = [] direction_rank = np.argsort(direction) ppmi_rank = np.argsort(ppmi[counter]) for d in range(len(ppmi[counter])): if ppmi[counter][d] != 0: total += ppmi[counter][d] amt += 1 new_direction.append(direction_rank[d]) new_ppmi.append(ppmi_rank[d]) average = total / amt min_max_scaler = preprocessing.MinMaxScaler() normalized_ppmi = min_max_scaler.fit_transform( ppmi[counter]) normalized_dir = min_max_scaler.fit_transform(direction) ginis = gini(normalized_ppmi, normalized_dir) ranked_ppmi = dt.sortByArray(new_ppmi, new_direction) nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi) ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi)) #binarizer = preprocessing.Binarizer() #binary_ppmi = binarizer.transform(normalized_ppmi) #normalized_dir = np.ndarray.tolist(normalized_dir) map = 0 #average_precision_score(normalized_ppmi, normalized_dir) rho, pvalue = spearmanr(new_ppmi, new_direction) rhok, pvaluek = kendalltau(new_ppmi, new_direction) scores.append(rho) pvalues.append(pvalue) scores_kendall.append(rhok) 
pvalues_kendall.append(pvaluek) andcg.append(ndcgs) agini.append(ginis) amap.append(map) averages.append(average) print phrases[counter] + ":", map, ginis counter += 1 dt.write1dArray(scores, "RuleType/s" + fn + ".score") dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue") dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score") dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue") dt.write1dArray(phrases, "RuleType/" + fn + ".names") dt.write1dArray(averages, "RuleType/" + fn + ".averages") dt.write1dArray(agini, "RuleType/gn" + fn + ".score") dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score") dt.write1dArray(amap, "RuleType/map" + fn + ".score")
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn): ppmi = dt.importLabels(ppmi_fn) ppmi = np.asarray(ppmi) phrases = dt.importString(phrases_fn) indexes_to_get = [] if phrases_to_check_fn != "": phrases_to_check = dt.importString(phrases_to_check_fn) for pc in range(len(phrases_to_check)): for p in range(len(phrases)): if phrases_to_check[pc] == phrases[p][6:]: indexes_to_get.append(p) ppmi = ppmi.transpose() print len(ppmi), len(ppmi[0]) scores = [] pvalues = [] scores_kendall = [] pvalues_kendall = [] counter = 0 averages = [] with open(direction_fn) as f: for line in f: if indexes_to_get is not []: for i in indexes_to_get: if i == counter: total = 0 amt = 0 direction = line.split() for d in range(len(direction)): direction[d] = float(direction[d]) new_direction = [] new_ppmi = [] direction_rank = np.argsort(direction) ppmi_rank = np.argsort(ppmi[counter]) for d in range(len(ppmi[counter])): if ppmi[counter][d] != 0: total += ppmi[counter][d] amt += 1 new_direction.append(direction_rank[d]) new_ppmi.append(ppmi_rank[d]) average = total / amt rho, pvalue = spearmanr(new_ppmi, new_direction) scores.append(rho) pvalues.append(pvalue) scores_kendall.append(rhok) pvalues_kendall.append(pvaluek) averages.append(average) print phrases[counter] + ":", rho, pvalue, average else: direction = line.split() for d in range(len(direction)): direction[d] = float(direction[d]) direction_rank = np.argsort(direction) ppmi_rank = np.argsort(ppmi[counter]) rho, pvalue = spearmanr(direction_rank, ppmi_rank) scores.append(rho) pvalues.append(pvalue) print phrases[counter] + ":", rho, pvalue counter += 1 dt.write1dArray(scores, "RuleType/s" + fn + ".score") dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue") dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score") dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue") dt.write1dArray(phrases, "RuleType/" + fn + ".names") dt.write1dArray(averages, "RuleType/" + fn + ".averages")
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn): ppmi = dt.importLabels(ppmi_fn) ppmi = np.asarray(ppmi) phrases = dt.importString(phrases_fn) indexes_to_get = [] if phrases_to_check_fn != "": phrases_to_check = dt.importString(phrases_to_check_fn) for pc in range(len(phrases_to_check)): for p in range(len(phrases)): if phrases_to_check[pc] == phrases[p][6:]: indexes_to_get.append(p) ppmi = ppmi.transpose() print len(ppmi), len(ppmi[0]) scores = [] pvalues = [] scores_kendall = [] pvalues_kendall = [] counter = 0 averages = [] with open(direction_fn) as f: for line in f: if indexes_to_get is not []: for i in indexes_to_get: if i == counter: total = 0 amt = 0 direction = line.split() for d in range(len(direction)): direction[d] = float(direction[d]) new_direction = [] new_ppmi = [] direction_rank = np.argsort(direction) ppmi_rank = np.argsort(ppmi[counter]) for d in range(len(ppmi[counter])): if ppmi[counter][d] != 0: total += ppmi[counter][d] amt += 1 new_direction.append(direction_rank[d]) new_ppmi.append(ppmi_rank[d]) average = total / amt rho, pvalue = spearmanr(new_ppmi, new_direction) scores.append(rho) pvalues.append(pvalue) scores_kendall.append(rhok) pvalues_kendall.append(pvaluek) averages.append(average) print phrases[counter] + ":", rho, pvalue, average else: direction = line.split() for d in range(len(direction)): direction[d] = float(direction[d]) direction_rank = np.argsort(direction) ppmi_rank = np.argsort(ppmi[counter]) rho, pvalue = spearmanr(direction_rank, ppmi_rank) scores.append(rho) pvalues.append(pvalue) print phrases[counter] + ":", rho, pvalue counter += 1 dt.write1dArray(scores, "RuleType/s" + fn + ".score") dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue") dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score") dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue") dt.write1dArray(phrases, "RuleType/" + fn + ".names") dt.write1dArray(averages, "RuleType/" + fn + ".averages")
def outputPhrases():
    """Write the phrase class matrices (binary and non-binary) to disk."""
    film_ids = dt.importString("filmdata/filmIDs.txt")
    phrases = dt.importString("filmdata/uniquePhrases.txt")
    nonbinary_vectors, binary_vectors = getVectors(film_ids, phrases)
    dt.write2dArray(binary_vectors, "filmdata/classesPhrases/class-all")
    dt.write2dArray(nonbinary_vectors, "filmdata/classesPhrases/nonbinary/class-all")