def getScoreDifferences(name_word_file1, name_score_file1, name_word_file2, name_score_file2, name, data_type):
    """Align two word/score file pairs on their shared vocabulary, then write
    the words and their score differences (scores1 - scores2), both sorted in
    ascending order of the difference.

    Output files go to ../data/<data_type>/SVM/difference/.
    """
    scores1 = import1dArray(name_score_file1, "f")
    scores2 = import1dArray(name_score_file2, "f")
    words1 = import1dArray(name_word_file1, "s")
    words2 = import1dArray(name_word_file2, "s")
    # Restrict the longer vocabulary to the entries it shares with the shorter one.
    if len(words1) > len(words2):
        shared = getIndexOfCommonElements(words2, words1)
        scores1 = np.asarray(scores1)[shared]
        words1 = np.asarray(words1)[shared]
    else:
        shared = getIndexOfCommonElements(words1, words2)
        scores2 = np.asarray(scores2)[shared]
        words2 = np.asarray(words2)[shared]
    differences_list = [scores1[i] - scores2[i] for i in range(len(scores1))]
    # Order the words by their score difference (ties fall back to word order).
    most_different_words = [w for _, w in sorted(zip(differences_list, words1))]
    differences_list = sorted(differences_list)
    out_dir = "../data/" + data_type + "/SVM/difference/"
    write1dArray(most_different_words, out_dir + "most_different_words_" + name + ".txt")
    write1dArray(differences_list, out_dir + "most_different_values_" + name + ".txt")
def getDifference(array1, array2):
    """Print every line of *array2*'s file that does not occur in *array1*'s file.

    :param array1: filename of the reference line list.
    :param array2: filename of the line list to check against the reference.

    PERF FIX: the original scanned the reference list from the start for every
    line (O(n*m)); a set gives the same membership answers in O(n+m). The
    confusingly swapped locals (file2 held array1's content) are renamed.
    """
    reference = set(import1dArray(array1))
    for line in import1dArray(array2):
        if line not in reference:
            print(line)
def writeClassAll(class_fn, full_phrases_fn, phrases_used_fn, file_name):
    """Keep only the class-matrix rows whose phrase appears in the used-phrase
    list (preserving full-phrase order) and write the filtered matrix.
    """
    full_phrases = import1dArray(full_phrases_fn)
    # Rows are assumed to parallel full_phrases — TODO confirm against writer.
    ppmi = import2dArray(class_fn)
    used = set(import1dArray(phrases_used_fn))
    new_ppmi = [ppmi[p] for p in range(len(full_phrases)) if full_phrases[p] in used]
    write2dArray(new_ppmi, file_name)
def getScores(names, full_scores, full_names, file_name, data_type):
    """Look up the score of each name in *names* within the full name/score
    lists and write the matched scores to a file.

    :param names: filename of the names to look up.
    :param full_scores: filename of all scores (parallel to full_names).
    :param full_names: filename of all names.
    :param file_name: basename for the output file.
    :param data_type: dataset folder under ../data/.
    :return: the path of the written score file.

    PERF FIX: replaced the O(n*m) nested scan with a dict keyed on the FIRST
    occurrence of each name, matching the original first-match-wins break.
    Names without a match are skipped, exactly as before.
    """
    full_scores = import1dArray(full_scores)
    full_names = import1dArray(full_names)
    names = import1dArray(names)
    score_by_name = {}
    for nm, sc in zip(full_names, full_scores):
        score_by_name.setdefault(nm, sc)
    final_scores = [score_by_name[nm] for nm in names if nm in score_by_name]
    out_fn = "../data/" + data_type + "/bow/scores/" + file_name + ".txt"
    write1dArray(final_scores, out_fn)
    return out_fn
def compileSVMResults(file_name, chunk_amt, data_type):
    """Merge per-chunk SVM result files into single combined files.

    Waits (by polling the filesystem) until every chunk's direction file
    exists, then concatenates the chunk direction matrices and kappa lists
    into "../data/<data_type>/svm/directions/<file_name>.txt" and
    "../data/<data_type>/svm/kappa/<file_name>.txt". Skips everything if the
    combined directions file already exists.

    :param file_name: base name shared by all chunk files.
    :param chunk_amt: number of chunks the SVM run was split into.
    :param data_type: dataset folder under ../data/.
    """
    if fileExists("../data/" + data_type + "/svm/directions/" + file_name + ".txt") is False:
        print("Compiling SVM results")
        randomcount = 0  # NOTE(review): unused local, kept for byte-compatibility
        # Build the expected per-chunk filenames ("<name> CID<c> CAMT<total>.txt").
        directions = []
        for c in range(chunk_amt):
            directions.append("../data/" + data_type + "/svm/directions/" + file_name + " CID" + str(c) + " CAMT" + str(chunk_amt) + ".txt")
        kappa = []
        for c in range(chunk_amt):
            kappa.append("../data/" + data_type + "/svm/kappa/" + file_name + " CID" + str(c) + " CAMT" + str(chunk_amt) + ".txt")
        # Block until every chunk's direction file has appeared on disk;
        # chunks are presumably produced by separate worker processes — TODO confirm.
        for f in directions:
            while not fileExists(f):
                time.sleep(10)
        # Extra grace period so a file that just appeared is fully written.
        time.sleep(10)
        # Concatenate all chunk results in chunk order.
        di = []
        for d in directions:
            di.extend(import2dArray(d))
        ka = []
        for k in kappa:
            ka.extend(import1dArray(k))
        write2dArray(
            di,
            "../data/" + data_type + "/svm/directions/" + file_name + ".txt")
        write1dArray(
            ka,
            "../data/" + data_type + "/svm/kappa/" + file_name + ".txt")
    else:
        print("Skipping compile")
def convertToPPMI(freq_arrays_fn, term_names_fn):
    """Convert a term-by-entity frequency matrix into PPMI scores.

    For each (term, entity) cell: ppmi = max(0, log(p(t,e) / (p(t) * p(e)))).
    Writes one 1-D file per term ("class-<term>") and the full 2-D matrix
    ("class-all") under ../data/movies/bow/ppmi/.

    :param freq_arrays_fn: file of the integer frequency matrix (terms x entities).
    :param term_names_fn: file of term names, parallel to the matrix rows.

    FIXES vs. the original:
    - the lazy entity-probability cache was a hard-coded 15000-slot list,
      which raised IndexError for more than 15000 entities; marginals are now
      precomputed once, sized by the data.
    - removed the per-term debug print of the whole ppmi row.
    """
    freq_arrays = np.asarray(import2dArray(freq_arrays_fn, "i"))
    term_names = import1dArray(term_names_fn)
    overall = float(freq_arrays.sum())
    # Marginal probabilities, computed once instead of per cell.
    term_p = freq_arrays.sum(axis=1) / overall
    entity_p = freq_arrays.sum(axis=0) / overall
    ppmi_arrays = []
    for t in range(len(freq_arrays)):
        ppmi_array = []
        for e in range(len(freq_arrays[t])):
            ppmi = 0.0
            freq = freq_arrays[t][e]
            if freq != 0:
                # freq > 0 implies both marginals are > 0, so no div-by-zero.
                proba = (freq / overall) / (entity_p[e] * term_p[t])
                ppmi = np.amax([0.0, np.log(proba)])
            ppmi_array.append(ppmi)
        ppmi_arrays.append(ppmi_array)
        write1dArray(ppmi_array, "../data/movies/bow/ppmi/class-" + term_names[t])
    write2dArray(ppmi_arrays, "../data/movies/bow/ppmi/class-all")
def getWordVectors(vector_save_fn, words_fn, wvn, wv_amt, svm_dir_fn=None):
    """Build a word-vector matrix from GloVe embeddings for the given words,
    optionally concatenating each word's SVM direction, and save it.

    Does nothing (beyond a notice) if *vector_save_fn* already exists.

    :param vector_save_fn: output path for the 2-D vector matrix.
    :param words_fn: file of words, one per line.
    :param wvn: output path for the word list parallel to the vectors.
    :param wv_amt: GloVe dimensionality (matches glove.6B.<n>d.txt).
    :param svm_dir_fn: optional file of SVM direction vectors, one per word.
    """
    if os.path.exists(vector_save_fn) is False:
        glove_file = datapath('/home/tom/Downloads/glove.6B/glove.6B.' + str(wv_amt) + 'd.txt')
        tmp_file = get_tmpfile(
            "/home/tom/Downloads/glove.6B/test_word2vec.txt")
        glove2word2vec(glove_file, tmp_file)
        # BUG FIX: the original called import2dArray(svm_dir_fn) unconditionally,
        # crashing the default svm_dir_fn=None path before the None checks below.
        svm_dir = import2dArray(svm_dir_fn) if svm_dir_fn is not None else None
        all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
        vectors = []
        words = import1dArray(words_fn)
        for w in range(len(words)):
            try:
                if svm_dir is None:
                    vectors.append(all_vectors.get_vector(words[w]))
                else:
                    vectors.append(
                        np.concatenate(
                            [all_vectors.get_vector(words[w]), svm_dir[w]]))
            except KeyError:
                # Out-of-vocabulary words get a zero vector of matching width.
                if svm_dir is None:
                    vectors.append(np.zeros(wv_amt))
                else:
                    vectors.append(np.zeros(wv_amt + len(svm_dir[0])))
        write2dArray(vectors, vector_save_fn)
        write1dArray(words, wvn)
    else:
        print("Already got word vectors", vector_save_fn)
def countClassFrequences(data_type, class_name):
    """Print, for every class, how many entities carry a non-zero label.

    Reads the class-all matrix and names file for *class_name* under
    ../data/<data_type>/classify/ and prints "<name> <count>" per class.
    """
    base = "../data/" + data_type + "/classify/" + class_name + "/"
    class_all = np.asarray(import2dArray(base + "class-all")).transpose()
    class_names = import1dArray(base + "names.txt")
    counts = []
    for i, row in enumerate(class_all):
        count = len(np.nonzero(row)[0])
        print(class_names[i], count)
        counts.append(count)
def obtainKappaOnClusteredDirection(names, ranks):
    """For each word, fit a linear SVM on *ranks* against the word's binary
    occurrence vector and return the Cohen's-kappa agreement of the SVM's own
    predictions with that vector, as a numpy array parallel to *names*.
    """
    kappas = np.empty(len(names))
    for idx, word in enumerate(names):
        occurrences = np.asarray(
            import1dArray("../data/movies/bow/binary/phrases/" + word, "i"))
        model = svm.LinearSVC()
        model.fit(ranks, occurrences)
        predictions = model.predict(ranks)
        kappas[idx] = cohen_kappa_score(occurrences, predictions)
    return kappas
def getNonZero(class_names_fn, file_name):
    """Print the non-zero count of every column (class) of the matrix in
    *file_name*. The class-names file is loaded but currently unused.
    """
    class_names = import1dArray(class_names_fn, "s")
    for column in np.asarray(import2dArray(file_name)).transpose():
        print(np.count_nonzero(column))
def writeIndividualClasses(overall_class_fn, names_fn, output_filename):
    """Split a combined class matrix into one 1-D file per class, written to
    "<output_filename>class-<name>", printing each name as it is written.
    """
    overall_class = import2dArray(overall_class_fn, "f")
    names = import1dArray(names_fn)
    for n, class_name in enumerate(names):
        write1dArray(overall_class[n], output_filename + "class-" + class_name)
        print(class_name)
def remove_indexes(indexes, array_fn):
    """Delete the entries at *indexes* from the 1-D array stored in *array_fn*
    and overwrite the file in place.
    """
    remaining = np.delete(np.asarray(import1dArray(array_fn)), indexes, axis=0)
    write1dArray(remaining, array_fn)
    print("wrote", array_fn)
return classes.transpose(), class_names """ """ if __name__ == '__main__': """ #countClassFrequences("reuters", "topics") class_fn = "../data/movies/classify/keywords/class-all" class_name_fn = "../data/movies/classify/keywords/names.txt" classes = import2dArray(class_fn) class_names = import1dArray(class_name_fn) classes, class_names = removeInfrequent(classes, class_names) """ words = import1dArray("../data/placetypes/bow/names/5-1-all.txt", "s") word_dict = {} for i in range(len(words)): word_dict[i] = words[i] averageWordVectorsFreq( word_dict, "../data/placetypes/bow/frequency/phrases/class-all-5-1-all", 200, "placetypes") averageWordVectorsFreq( word_dict, "../data/placetypes/bow/frequency/phrases/class-all-5-1-all", 100, "placetypes") averageWordVectorsFreq( word_dict, "../data/placetypes/bow/frequency/phrases/class-all-5-1-all", 50,
def parameter_list_to_dict_str(parameter_list_string):
    """Convert "name value ..." parameter lines into the source lines of a
    Python dict literal named ``param_dict``.

    Lines starting with '#' and blank lines are skipped; each remaining line
    contributes "\t'<name>': <name>," keyed on its first whitespace token.

    :param parameter_list_string: list of raw parameter-file lines.
    :return: list of source lines, opening with "param_dict = {" and closing
        with "}".

    BUG FIX: the accumulator initialisation was commented out in the original,
    so the first append raised NameError. Also renamed the local that
    shadowed the builtin ``str``.
    """
    dict_str = ["param_dict = {"]
    for line in parameter_list_string:
        if line[:1] == "#":
            continue
        split = line.split()
        if len(split) == 0:
            continue
        dict_str.append("\t'" + split[0] + "': " + split[0] + ",")
    dict_str.append("}")
    return dict_str


if __name__ == '__main__':
    # Imported here (not at module top) so the pure conversion function can be
    # used without the project-local util package being importable.
    from util import io
    parameter_list_string = io.import1dArray(
        "../../data/parameter_list_string.txt")
    parameter_dict = parameter_list_to_dict_str(parameter_list_string)
    io.write1dArray(parameter_dict, "../../data/parameter_dict.txt")