def getNamesFromDict(dict_fn, file_name):
    """Save the first entry of every row of a 2-D file as a list of names.

    The file at ``dict_fn`` is read with ``import2dArray`` using the ``"s"``
    flag. The first element of each row is stripped of surrounding whitespace
    and the resulting list is written to
    ``../data/movies/cluster/hierarchy_names/<file_name>.txt``.
    """
    rows = import2dArray(dict_fn, "s")
    stripped_names = [row[0].strip() for row in rows]
    output_fn = "../data/movies/cluster/hierarchy_names/" + file_name + ".txt"
    write1dArray(stripped_names, output_fn)
def compileSVMResults(file_name, chunk_amt, data_type):
    """Merge per-chunk SVM direction and kappa files into one file each.

    Chunk files are expected at
    ``../data/<data_type>/svm/{directions,kappa}/<file_name> CID<c> CAMT<chunk_amt>.txt``
    for ``c`` in ``range(chunk_amt)``. The function blocks (polling every
    10 seconds) until all of them exist, concatenates them in chunk order and
    writes ``<file_name>.txt`` into the ``directions`` and ``kappa`` folders.

    Nothing is done if the merged directions file already exists.
    """
    svm_root = "../data/" + data_type + "/svm/"
    merged_directions_fn = svm_root + "directions/" + file_name + ".txt"
    if fileExists(merged_directions_fn):
        print("Skipping compile")
        return

    print("Compiling SVM results")
    chunk_suffixes = [
        " CID" + str(c) + " CAMT" + str(chunk_amt) + ".txt"
        for c in range(chunk_amt)
    ]
    direction_fns = [
        svm_root + "directions/" + file_name + suffix
        for suffix in chunk_suffixes
    ]
    kappa_fns = [
        svm_root + "kappa/" + file_name + suffix for suffix in chunk_suffixes
    ]

    # Wait for BOTH kinds of chunk output: the kappa files are read below as
    # well, so waiting on the direction files alone could race with a chunk
    # that has not written its kappa file yet.
    for chunk_fn in direction_fns + kappa_fns:
        while not fileExists(chunk_fn):
            time.sleep(10)
    # Extra pause after the last file shows up (presumably to let an
    # in-progress write finish before the file is read).
    time.sleep(10)

    merged_directions = []
    for chunk_fn in direction_fns:
        merged_directions.extend(import2dArray(chunk_fn))
    merged_kappa = []
    for chunk_fn in kappa_fns:
        merged_kappa.extend(import1dArray(chunk_fn))

    write2dArray(merged_directions, merged_directions_fn)
    write1dArray(merged_kappa, svm_root + "kappa/" + file_name + ".txt")
def getTop10Clusters(file_name, ids):
    """Print, for each requested id, the cluster names whose rank is at most 3.

    Rankings are read from ``../data/movies/rank/discrete/<file_name>P1.txt``
    and transposed, so that ``ids`` index the first axis of the transposed
    table. Each cell is a string whose last character is dropped before it is
    parsed as an integer rank. Cluster names come from the first column of
    ``../data/movies/cluster/hierarchy_names/<file_name>0.8400.txt``; the
    first six characters of a name are skipped when printing. A separator
    line is printed after every id.
    """
    rank_table = np.asarray(
        import2dArray("../data/movies/rank/discrete/" + file_name + "P1.txt",
                      "s")).transpose()
    cluster_names = import2dArray(
        "../data/movies/cluster/hierarchy_names/" + file_name + "0.8400.txt",
        "s")
    # Collapse every row to its first entry, in place.
    for row_index, row in enumerate(cluster_names):
        cluster_names[row_index] = row[0]
    for entity_id in ids:
        for cluster_index, cell in enumerate(rank_table[entity_id]):
            if int(cell[:-1]) <= 3:
                print(cluster_names[cluster_index][6:])
        print("----------------------")
def convertToPPMI(freq_arrays_fn, term_names_fn):
    """Convert a term-by-entity frequency matrix to PPMI and write it out.

    ``freq_arrays_fn`` is loaded as a 2-D array whose rows are terms and
    whose columns are entities; ``term_names_fn`` supplies one name per row.
    For every non-zero frequency the value written is
    ``max(0, log(p(term, entity) / (p(term) * p(entity))))`` where all
    probabilities are counts divided by the grand total; zero frequencies
    stay ``0.0``.

    Each row is printed, written to ``../data/movies/bow/ppmi/class-<term>``,
    and the full matrix is written to ``../data/movies/bow/ppmi/class-all``.
    """
    freq_arrays = np.asarray(import2dArray(freq_arrays_fn, "i"))
    term_names = import1dArray(term_names_fn)
    ppmi_arrays = []
    overall = 0.0
    for term_freqs in freq_arrays:
        overall += sum(term_freqs)
    # Column probabilities are computed on first use and cached by column
    # index, so the cache grows with the data instead of relying on a fixed
    # number of entities.
    entity_probs = {}
    # For each term
    for t, term_freqs in enumerate(freq_arrays):
        ppmi_array = []
        term_p = sum(term_freqs) / overall
        for e, freq in enumerate(term_freqs):
            ppmi = 0.0
            if freq != 0:
                freq_p = freq / overall
                entity_p = entity_probs.get(e)
                if entity_p is None:
                    entity_p = sum(freq_arrays[:, e]) / overall
                    entity_probs[e] = entity_p
                proba = freq_p / (entity_p * term_p)
                # Clip negative PMI to zero (the "positive" in PPMI).
                ppmi = np.amax([0.0, np.log(proba)])
            ppmi_array.append(ppmi)
        print(ppmi_array)
        ppmi_arrays.append(ppmi_array)
        write1dArray(ppmi_array,
                     "../data/movies/bow/ppmi/class-" + term_names[t])
    write2dArray(ppmi_arrays, "../data/movies/bow/ppmi/class-all")
def getWordVectors(vector_save_fn, words_fn, wvn, wv_amt, svm_dir_fn=None):
    """Look up GloVe vectors for a word list and save them.

    Does nothing (apart from printing a message) when ``vector_save_fn``
    already exists. Otherwise the ``wv_amt``-dimensional GloVe file under the
    hard-coded ``/home/tom/Downloads/glove.6B/`` directory is converted with
    ``glove2word2vec`` and loaded, and one vector per word in ``words_fn`` is
    produced:

    * without ``svm_dir_fn``: the word's GloVe vector, or ``wv_amt`` zeros if
      the word is not in the vocabulary;
    * with ``svm_dir_fn``: the GloVe vector concatenated with the row of the
      SVM-direction file at the same index, or an all-zero vector of the
      combined width if the word is not in the vocabulary.

    The vectors are written to ``vector_save_fn`` and the word list to
    ``wvn``.
    """
    if os.path.exists(vector_save_fn):
        print("Already got word vectors", vector_save_fn)
        return

    glove_file = datapath('/home/tom/Downloads/glove.6B/glove.6B.' +
                          str(wv_amt) + 'd.txt')
    tmp_file = get_tmpfile(
        "/home/tom/Downloads/glove.6B/test_word2vec.txt")
    glove2word2vec(glove_file, tmp_file)
    # Only read the SVM directions when a file was actually supplied; the
    # parameter defaults to None.
    svm_dir = import2dArray(svm_dir_fn) if svm_dir_fn is not None else None
    all_vectors = KeyedVectors.load_word2vec_format(tmp_file)

    words = import1dArray(words_fn)
    vectors = []
    for w, word in enumerate(words):
        try:
            word_vector = all_vectors.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: keep the row, filled with zeros of the
            # same width as the other rows.
            if svm_dir is None:
                vectors.append(np.zeros(wv_amt))
            else:
                vectors.append(np.zeros(wv_amt + len(svm_dir[0])))
            continue
        if svm_dir is None:
            vectors.append(word_vector)
        else:
            vectors.append(np.concatenate([word_vector, svm_dir[w]]))

    write2dArray(vectors, vector_save_fn)

    write1dArray(words, wvn)
def countClassFrequences(data_type, class_name):
    """Print and return the number of non-zero entries for every class.

    Reads ``../data/<data_type>/classify/<class_name>/class-all`` and
    transposes it, so that each row of the transposed matrix is paired with
    the name at the same index in ``.../names.txt``. For every such row the
    class name and its count of non-zero entries are printed.

    Returns:
        list[int]: the non-zero counts, in the order they were printed.
    """
    class_dir = "../data/" + data_type + "/classify/" + class_name
    class_all = import2dArray(class_dir + "/class-all")
    class_names = import1dArray(class_dir + "/names.txt")
    class_all = np.asarray(class_all).transpose()
    counts = []
    for i, class_column in enumerate(class_all):
        count = int(np.count_nonzero(class_column))
        print(class_names[i], count)
        counts.append(count)
    # Hand the collected counts back to the caller instead of discarding them.
    return counts
def writeClassAll(class_fn, full_phrases_fn, phrases_used_fn, file_name):
    """Write only the rows of a 2-D file whose phrase is in a "used" list.

    Row ``p`` of ``class_fn`` is kept when ``full_phrases[p]`` also occurs in
    the list loaded from ``phrases_used_fn``. Kept rows preserve the order of
    ``full_phrases`` and are written to ``file_name``.

    NOTE(review): rows of ``class_fn`` are indexed by position in
    ``full_phrases``, i.e. the file is read without transposing - confirm the
    file is stored phrase-per-row.
    """
    full_phrases = import1dArray(full_phrases_fn)
    ppmi = import2dArray(class_fn)
    # A set gives constant-time membership tests instead of rescanning the
    # whole "used" list for every phrase.
    phrases_used = set(import1dArray(phrases_used_fn))
    new_ppmi = [
        ppmi[p] for p, phrase in enumerate(full_phrases)
        if phrase in phrases_used
    ]
    write2dArray(new_ppmi, file_name)
def averageWordVectors(id2word, ppmi_fn, size, data_type):
    """Build one vector per document by averaging weighted GloVe vectors.

    ``ppmi_fn`` is loaded as a document-by-word matrix whose row width must
    equal the number of keys in ``id2word``; otherwise a message is printed
    and the process exits. For every positive entry the GloVe vector of the
    corresponding word is multiplied by that entry; words missing from GloVe
    are reported and skipped. The weighted vectors of a document are averaged
    (a zero vector of length ``size`` is used when there are none) and the
    result is saved to
    ``../data/<data_type>/nnet/spaces/wvPPMIFIXED<size>.npy``.

    The GloVe location is a hard-coded Windows path.
    """
    bow = import2dArray(ppmi_fn)

    bow_width = len(bow[0])
    if bow_width != len(id2word.keys()):
        print("vocab and bow dont match", bow_width, len(id2word.keys()))
        exit()
    print("Creating dict")
    print("Importing word vectors")
    glove_file = datapath(
        "D:/Dropbox/PhD/My Work/Code/Paper 2/data/raw/glove/glove.6B." +
        str(size) + 'd.txt')
    tmp_file = get_tmpfile(
        "D:/Dropbox/PhD/My Work/Code/Paper 2/data/raw/glove/test_word2vec.txt")
    glove2word2vec(glove_file, tmp_file)

    all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
    print("Creating vectors")
    vectors = []
    for doc_index, doc in enumerate(bow):
        weighted_vectors = []
        for word_id, weight in enumerate(doc):
            if weight > 0:
                try:
                    weighted_vectors.append(
                        np.multiply(all_vectors.get_vector(id2word[word_id]),
                                    weight))
                except KeyError:
                    print("keyerror", id2word[word_id])
        if weighted_vectors:
            print(doc_index, "words:", len(weighted_vectors), "dim",
                  len(weighted_vectors[0]))
        else:
            # No usable word: fall back to a single zero vector. The counts
            # printed below describe this placeholder.
            weighted_vectors = [np.zeros(shape=size)]
            print("FAILED", doc_index, "words:", len(weighted_vectors), "dim",
                  len(weighted_vectors[0]))
        vectors.append(np.average(weighted_vectors, axis=0))

    output_fn = ("../data/" + data_type + "/nnet/spaces/wvPPMIFIXED" +
                 str(size) + ".npy")
    np.save(output_fn, vectors)
def averageWordVectorsFreq(id2word, freq_fn, size, data_type):
    """Build one vector per document by averaging the GloVe vectors of its words.

    ``freq_fn`` is loaded with the ``"i"`` flag and transposed; after the
    transpose each row is treated as a document and its width must equal the
    number of keys in ``id2word`` (otherwise a message is printed and the
    process exits). Every word with a positive entry contributes its
    unweighted GloVe vector; words missing from GloVe are reported and
    skipped. A zero vector of length ``size`` is used for documents without
    any usable word. The result is saved to
    ``../data/<data_type>/nnet/spaces/wvFIXED<size>.npy``.

    The GloVe location is a hard-coded Windows path.
    """
    # Backslashes are escaped explicitly: "\D" and "\W" are not valid escape
    # sequences and trigger a warning on recent Python versions. The
    # resulting path strings are unchanged.
    glove_file = datapath("D:\\Downloads\\Work/glove.6B/glove.6B." +
                          str(size) + 'd.txt')
    tmp_file = get_tmpfile("D:\\Downloads\\Work/glove.6B/test_word2vec.txt")
    bow = import2dArray(freq_fn, "i")

    print("Transposing PPMI")
    # Convert first so the transpose also works when the loader returns a
    # plain list of lists.
    bow = np.asarray(bow).transpose()
    if len(bow[0]) != len(id2word.keys()):
        print("vocab and bow dont match", len(bow[0]), len(id2word.keys()))
        exit()
    print("Creating dict")
    print("Importing word vectors")
    glove2word2vec(glove_file, tmp_file)

    all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
    print("Creating vectors")
    vectors = []
    for i, doc in enumerate(bow):
        to_average = []
        for w, freq in enumerate(doc):
            if freq > 0:
                try:
                    to_average.append(all_vectors.get_vector(id2word[w]))
                except KeyError:
                    print("keyerror", id2word[w])
        if len(to_average) == 0:
            # No usable word: fall back to a single zero vector. The counts
            # printed below describe this placeholder.
            to_average = [np.zeros(shape=size)]
            print("FAILED", i, "words:", len(to_average), "dim",
                  len(to_average[0]))
        else:
            print(i, "words:", len(to_average), "dim", len(to_average[0]))
        vectors.append(np.average(to_average, axis=0))

    np.save(
        "../data/" + data_type + "/nnet/spaces/wvFIXED" + str(size) + ".npy",
        vectors)
def getNonZero(class_names_fn, file_name):
    """Print the number of non-zero entries in every column of a 2-D file.

    ``file_name`` is loaded and transposed, and one count is printed per row
    of the transposed matrix. ``class_names_fn`` is read with the ``"s"``
    flag, but the names are not used in the output.
    """
    class_names = import1dArray(class_names_fn, "s")
    transposed = np.asarray(import2dArray(file_name)).transpose()
    for column in transposed:
        print(np.count_nonzero(column))
def writeIndividualClasses(overall_class_fn, names_fn, output_filename):
    """Split a 2-D file into one file per name.

    Row ``n`` of ``overall_class_fn`` (loaded with the ``"f"`` flag) is
    written to ``<output_filename>class-<names[n]>`` for every name in
    ``names_fn``; each name is printed after its file has been written.
    """
    overall_class = import2dArray(overall_class_fn, "f")
    names = import1dArray(names_fn)
    for row_index, name in enumerate(names):
        class_row = overall_class[row_index]
        write1dArray(class_row, output_filename + "class-" + name)
        print(name)
def removeIndexes(file_name, indexes, type="f"):
    """Write a copy of a 2-D file with the given rows removed.

    The file is loaded with ``import2dArray(file_name, type)``, the rows
    listed in ``indexes`` are deleted along axis 0, and the remainder is
    written next to the input: the last four characters of ``file_name`` are
    replaced by ``removedind.txt``.
    """
    source_rows = import2dArray(file_name, type)
    remaining_rows = np.delete(source_rows, indexes, axis=0)
    output_fn = file_name[:-4] + "removedind.txt"
    write2dArray(remaining_rows, output_fn)
def shorten2dFloats(floats_fn):
    """Load a 2-D file and round every row to four decimal places.

    Rows are replaced in place in the loaded structure, which is then
    returned.
    """
    rows = import2dArray(floats_fn)
    for row_index, row in enumerate(rows):
        rows[row_index] = np.around(row, decimals=4)
    return rows