Example #1
def logisticRegression(cluster_names_fn,
                       ranking_fn,
                       file_name,
                       do_p=False,
                       data_type="movies",
                       rewrite_files=False,
                       limit_entities=False,
                       classification="genres",
                       lowest_amt=0,
                       highest_amt=2147000000,
                       sparse_freqs_fn=None,
                       bow_names_fn=None):
    lr_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [lr_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClusters.__name__)
        return
    else:
        print("Running task", bagOfClusters.__name__)

    if limit_entities is False:
        classification = "all"

    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)

    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt,
                     classification, file_name, bow_names, sparse_freqs)

    dt.write2dArray(frq, lr_fn)
    return frq
Example #2
def bagOfClusters(cluster_names_fn,
                  ranking_fn,
                  file_name,
                  do_p=False,
                  data_type="movies",
                  rewrite_files=False,
                  limit_entities=False,
                  classification="genres",
                  lowest_amt=0,
                  highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClusters.__name__)
        return
    else:
        print("Running task", bagOfClusters.__name__)

    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import2dArray(cluster_names_fn, "s")

    frq = writeBagOfClusters(names, data_type, lowest_amt, highest_amt,
                             classification)

    dt.write2dArray(frq, pavPPMI_fn)
    return frq
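writeBagOfClusters is defined elsewhere in the project; judging by the surrounding functions, it aggregates per-word frequency vectors over each cluster's terms. A minimal sketch of that aggregation with in-memory inputs follows; the function name, arguments, and the name-to-row lookup are assumptions for illustration only.
import numpy as np

def bag_of_clusters_sketch(cluster_terms, bow_names, sparse_freqs):
    # cluster_terms: one list of terms per cluster
    # bow_names: vocabulary, aligned with the rows of sparse_freqs
    # sparse_freqs: scipy.sparse matrix of shape (n_terms, n_entities)
    name_to_row = {name: i for i, name in enumerate(bow_names)}
    bags = []
    for terms in cluster_terms:
        rows = [name_to_row[t] for t in terms if t in name_to_row]
        # Sum the frequency rows of every term in the cluster into one entity vector
        bags.append(np.asarray(sparse_freqs[rows].sum(axis=0)).ravel())
    return np.asarray(bags)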
Example #3
def avgPPMI(cluster_names_fn,
            ranking_fn,
            file_name,
            do_p=False,
            data_type="movies",
            rewrite_files=False,
            classification="genres",
            lowest_amt=0,
            highest_amt=2147000000,
            limit_entities=False,
            save_results_so_far=False):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if (dt.allFnsAlreadyExist(all_fns) and not rewrite_files) or save_results_so_far:
        print("Skipping task", avgPPMI.__name__)
        return
    else:
        print("Running task", avgPPMI.__name__)

    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import2dArray(cluster_names_fn, "s")

    for n in range(len(names)):
        for x in range(len(names[n])):
            if ":" in names[n][x]:
                names[n][x] = names[n][x][:-1]

    frq = []
    counter = 0

    for n in range(len(names)):
        name_frq = []
        for name in names[n]:
            name_frq.append(
                readPPMI(name, data_type, lowest_amt, highest_amt,
                         classification))
        avg_frq = []
        name_frq = np.asarray(name_frq).transpose()
        for name in name_frq:
            avg_frq.append(np.average(name))
        frq.append(np.asarray(avg_frq))
        print(n)

    dt.write2dArray(frq, pavPPMI_fn)
    return frq
Example #4
def bagOfClustersPavPPMI(cluster_names_fn,
                         ranking_fn,
                         file_name,
                         do_p=False,
                         data_type="movies",
                         rewrite_files=False,
                         limit_entities=False,
                         classification="genres",
                         lowest_amt=0,
                         highest_amt=2147000000,
                         sparse_freqs_fn=None,
                         bow_names_fn=None):

    pavPPMI_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClustersPavPPMI.__name__)
        return
    else:
        print("Running task", bagOfClustersPavPPMI.__name__)

    if limit_entities is False:
        classification = "all"

    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)
    ranking = dt.import2dArray(ranking_fn)
    cluster_names = dt.import2dArray(cluster_names_fn, "s")

    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt,
                     classification, file_name, bow_names, sparse_freqs)

    pav_classes = []

    for f in range(len(frq)):
        print(cluster_names[f])
        x = np.asarray(frq[f])
        y = ranking[f]

        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)

    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
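This function and pavPPMI below rely on scikit-learn's IsotonicRegression for the PAV step. A small self-contained example of what fit_transform produces for a single cluster (the toy numbers are invented):
import numpy as np
from sklearn.isotonic import IsotonicRegression

x = np.array([0.1, 0.9, 0.3, 0.7, 0.5])   # e.g. per-entity PPMI-style values
y = np.array([1.0, 5.0, 2.0, 4.0, 3.0])   # e.g. ranking scores for the same entities

ir = IsotonicRegression()
y_ = ir.fit_transform(x, y)                # least-squares fit of y that is monotone in x
print(y_)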
Example #5
def makeTopVectors(filename):

    vectors = dt.import2dArray("Rankings/" + filename + ".space")
    top250names = dt.import1dArray("filmdata/top250.txt")
    film_names = dt.import1dArray("filmdata/filmNames.txt")

    indexes = []
    ordered_names = []
    for f in range(len(film_names)):
        for t in top250names:
            if film_names[f] == t:
                indexes.append(f)
                ordered_names.append(t)

    top_vectors = [[]]
    for v in range(len(vectors)):
        if v > 0:
            top_vectors.append([])
        for i in range(len(vectors[v])):
            for id in indexes:
                if i == id:
                    top_vectors[v].append(vectors[v][i])

    dt.write2dArray(top_vectors, "Plots/Top174" + filename + ".space")
    dt.write1dArray(ordered_names, "Plots/Top174OrderedByOriginalList.txt")
Example #6
def plotClusters(filename):
    names = dt.import1dArray("Plots/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("Plots/Top174" + filename + ".space")
    cluster_names = dt.import1dArray(
        "Clusters/films100N0.6H25L3CutLeastSimilarHIGH0.75,0.67.names")

    #svd = TruncatedSVD(n_components=2, random_state=42)

    cx = 8
    cy = 9
    x = []
    y = []
    for s in space[cx]:
        x.append(s)
    for s in space[cy]:
        y.append(s)

    #svd_space = svd.fit_transform(space)

    fig, ax = plt.subplots()
    ax.scatter(x, y, picker=True)
    #for i, name in enumerate(found_names):
    #    ax.annotate(name, (x[i], y[i]))
    ax.set_xlabel(cluster_names[cx])
    ax.set_ylabel(cluster_names[cy])

    def onpick3(event):
        ind = event.ind
        print('onpick3 scatter:', names[ind[0]])

    fig.canvas.mpl_connect('pick_event', onpick3)

    plt.show()
Example #7
def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # TODO: this step was never finished; the loop below is a placeholder
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    pass

    dt.write2dArray(cutoff_words,
                    "../data/movies/rules/cutoff/" + file_name + "WVN.txt")
Example #8
def main(data_type, clf, min, max, depth, rewrite_files):
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dmround"
    mds_fn = "../data/"+data_type+"/mds/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "d" + str(depth)
    svd_fn = "../data/"+data_type+"/svd/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    pca_fn = "../data/"+data_type+"/pca/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "round"

    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/simple_numeric_stopwords_ppmi 2-all.npz"
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()

    #Get MDS
    """
    tf = dt.import2dArray(term_frequency_fn).transpose()
    pca = sparseSVD(tf, depth)
    dt.write2dArray(pca, pca_fn)
    """

    # REMINDER: np.dot is WAY faster!
    tf = dt.import2dArray(term_frequency_fn, return_sparse=True)

    dm = getDsimMatrixDense(tf)
    dt.write2dArray(dm, dm_fn)
    print("wrote dm")
    """ Pretty sure none of this works
Example #9
def plotSVD(filename):
    names = dt.import1dArray("Plots/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("Plots/Top174" + filename + ".space")

    space = np.matrix.transpose(np.asarray(space))
    space = space.tolist()
    svd = TruncatedSVD(n_components=2, random_state=42)
    svd_space = svd.fit_transform(space)

    x = []
    y = []

    for s in svd_space:
        print(s)
        x.append(s[0])
        y.append(s[1])

    fig, ax = plt.subplots()
    ax.scatter(x, y, picker=True)

    # for i, name in enumerate(found_names):
    #    ax.annotate(name, (x[i], y[i]))

    def onpick3(event):
        ind = event.ind
        print('onpick3 scatter:', names[ind[0]])

    fig.canvas.mpl_connect('pick_event', onpick3)

    plt.show()
Example #10
def pavTermFrequency(ranking_fn, cluster_names_fn, fn, do_plot):
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        frq.append(readFreq(name))

    pav_classes = []

    for f in range(len(frq)):
        print(names[f])
        x = np.asarray(frq[f])
        y = ranking[f]

        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_plot:
            plot(x, y, y_)
        print(f)

    dt.write2dArray(
        pav_classes,
        "../data/movies/finetune/" + file_name + "PavTermFrequency.txt")
    return pav_classes
Example #11
def PPMIFT(cluster_names_fn,
           ranking_fn,
           file_name,
           do_p=False,
           data_type="movies",
           rewrite_files=False,
           limit_entities=False,
           classification="genres",
           lowest_amt=0,
           highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
    print("certainly still running that old pavPPMI task, yes sir")
    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(
            readPPMI(name, data_type, lowest_amt, highest_amt, classification))

    dt.write2dArray(frq, pavPPMI_fn)
    return frq
Example #12
def match_entities(entity_fn, t_entity_fn, entities_fn, classification):
    names = dt.import1dArray(entity_fn)
    t_names = dt.import1dArray(t_entity_fn)
    entities = dt.import2dArray(entities_fn)
    indexes_to_delete = []
    amount_found = 0
    for n in range(len(names)):
        names[n] = dt.removeEverythingFromString(names[n])
    for n in range(len(t_names)):
        t_names[n] = dt.removeEverythingFromString(t_names[n])
    matched_ids = []
    for n in range(len(t_names)):
        for ni in range(len(names)):
            matched_name = t_names[n]
            all_name = names[ni]
            if matched_name == all_name:
                print(matched_name)
                matched_ids.append(ni)
                amount_found += 1
                break
    matched_entities = []
    for e in matched_ids:
        matched_entities.append(entities[e])

    print("Amount found", amount_found)
    dt.write2dArray(matched_entities, entities_fn[:len(entities_fn)-4] + "-" + classification + ".txt")
Example #13
def getAllRankings(directions_fn,
                   vectors_fn,
                   cluster_names_fn,
                   vector_names_fn,
                   percent,
                   percentage_increment,
                   by_vector,
                   fn,
                   discrete=True,
                   data_type="movies",
                   rewrite_files=False):

    labels_fn = "../data/" + data_type + "/rank/labels/" + fn + ".txt"
    rankings_fn = "../data/" + data_type + "/rank/numeric/" + fn + ".txt"
    discrete_labels_fn = "../data/" + data_type + "/rank/discrete/" + fn + ".txt"

    all_fns = [rankings_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        for f in all_fns:
            print(f, "Already exists")
        print("Skipping task", "getAllRankings")
        return
    else:
        print("Running task", "getAllRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    cluster_names = dt.import1dArray(cluster_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    rankings = getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.asarray(rankings)
    if discrete:
        labels = createLabels(rankings, percent)
        labels = np.asarray(labels)
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        if discrete:
            labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()
    if discrete:
        dt.write2dArray(labels, labels_fn)

    dt.write2dArray(rankings, rankings_fn)
    if discrete:
        dt.write2dArray(discrete_labels, discrete_labels_fn)
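getRankings is defined elsewhere; assuming each ranking value is the dot product of an entity vector with a direction, its core can be sketched as follows (the function and argument names are illustrative):
import numpy as np

def get_rankings_sketch(directions, vectors):
    # directions: (n_directions, dim) array, vectors: (n_entities, dim) array
    # Returns one row of scores per direction, one column per entity
    return np.asarray(directions) @ np.asarray(vectors).T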
Example #14
def main(data_type, vector_size, window_size, min_count, sampling_threshold, negative_size,
                               train_epoch, dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + " MC" + str(min_count) + " ST" + str(
        sampling_threshold) + \
                " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(
        worker_count) + "spacy"
    " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(worker_count) + \
    " TW" + str(train_wv) + " CW" + str(concatenate_wv) + " HS" + str(use_hierarchical_softmax)

    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"

    if os.path.exists(corpus_fn) is False:
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=object)
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)

    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"

    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"

    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
                        negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax)
        model.save(model_fn)

    if os.path.exists(vector_fn) is False:
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)

    if os.path.exists(score_fn) is False or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
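The doc2Vec helper called above is project code; a minimal sketch of training a comparable model directly with gensim (assuming gensim 3.4 or later; the mapping of the hyperparameters onto gensim's keyword arguments is an assumption):
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def train_doc2vec_sketch(texts, vector_size=300, window_size=5, min_count=1,
                         sampling_threshold=1e-5, negative_size=5, train_epoch=20,
                         dm=0, worker_count=4, use_hierarchical_softmax=0):
    # texts: iterable of raw document strings
    docs = [TaggedDocument(t.split(), [i]) for i, t in enumerate(texts)]
    model = Doc2Vec(docs, vector_size=vector_size, window=window_size,
                    min_count=min_count, sample=sampling_threshold,
                    negative=negative_size, epochs=train_epoch, dm=dm,
                    workers=worker_count, hs=use_hierarchical_softmax)
    return model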
Example #15
def convertToTfIDF(data_type, lowest_count, highest_count, freq_arrays_fn, class_type):
    freq = np.asarray(dt.import2dArray(freq_arrays_fn))
    v = TfidfTransformer()
    x = v.fit_transform(freq)
    x = x.toarray()
    dt.write2dArray(x, "../data/"+data_type+"/bow/tfidf/class-all-"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type))
    dt.writeClassAll("../data/"+data_type+"/bow/tfidf/class-all-"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type),
                     "../data/"+data_type+"/bow/names/"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type)+".txt",
                  "../data/"+data_type+"/bow/names/"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type)+".txt",
                     "../data/"+data_type+"/bow/tfidf/class-all-"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type))
Example #16
def pavPPMI(cluster_names_fn,
            ranking_fn,
            file_name,
            do_p=False,
            data_type="movies",
            rewrite_files=False,
            limit_entities=False,
            classification="genres",
            lowest_amt=0,
            highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
    print("certainly still running that old pavPPMI task, yes sir")
    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(
            readPPMI(name, data_type, lowest_amt, highest_amt, classification))

    pav_classes = []

    for f in range(len(frq)):
        try:
            print(names[f])
            x = np.asarray(frq[f])
            y = ranking[f]

            ir = IsotonicRegression()
            y_ = ir.fit_transform(x, y)
            pav_classes.append(y_)
            if do_p:
                plot(x, y, y_)
        except ValueError:
            print(names[f], "len ppmi",
                  len(frq[f], "len ranking", len(ranking[f])))
            exit()
        print(f)

    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
Example #17
def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # For each cluster
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(
                clusters[cl])
            explained_cutoff.append(cluster_explanation + ",")

            dict_index = 0
            for h in range(len(cluster_dict_arrays[cl])):
                if cluster_dict_arrays[cl][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[cl][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)
    dt.write2dArray(
        explanations,
        "../data/movies/rules/final_names/" + file_name + "WVN.txt")
    dt.write2dArray(explanation_cutoffs,
                    "../data/movies/rules/final_cutoff/" + file_name + ".txt")
Example #18
def saveClusters(directions_fn,
                 scores_fn,
                 names_fn,
                 filename,
                 amt_of_dirs,
                 data_type,
                 cluster_amt,
                 rewrite_files=False,
                 algorithm="meanshift_k"):

    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    # Convert to arrays so the argsort-based fancy indexing below works
    p_dir = np.asarray(dt.import2dArray(directions_fn))
    p_names = np.asarray(dt.import1dArray(names_fn, "s"))
    p_scores = np.asarray(dt.import1dArray(scores_fn, "f"))

    ids = np.argsort(p_scores)

    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    unique, counts = np.unique(labels, return_counts=True)

    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")

    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
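meanShift and kMeans here are project wrappers; a sketch of equivalent calls with scikit-learn (the wrapper signatures are assumptions):
import numpy as np
from sklearn.cluster import KMeans, MeanShift

def cluster_labels_sketch(directions, cluster_amt=None):
    # directions: (n_directions, dim); returns one integer cluster label per direction
    X = np.asarray(directions)
    if cluster_amt is None:
        return MeanShift().fit_predict(X)            # bandwidth estimated automatically
    return KMeans(n_clusters=cluster_amt).fit_predict(X)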
Example #19
def getAllPhraseRankings(directions_fn=None,
                         vectors_fn=None,
                         property_names_fn=None,
                         vector_names_fn=None,
                         fn="no filename",
                         percentage_increment=1,
                         scores_fn=None,
                         top_amt=0,
                         discrete=False,
                         data_type="movies",
                         rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"

    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names,
                                               scores)[:top_amt]

    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    #dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")

    dt.write2dArray(rankings, rankings_fn_all)
Example #20
def plotTopVectors(filename):

    names = dt.import1dArray(
        "../data/movies/plot/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("../data/movies/plot/Top174" + filename +
                             ".space")

    svd = TruncatedSVD(n_components=2, random_state=42)

    svd_space = svd.fit_transform(space)
    pl.plot(space[0], 'rx')
    pl.show()
    """
Example #21
def trimRankings(rankings_fn, available_indexes_fn, names, folder_name):
    available_indexes = dt.import1dArray(available_indexes_fn)
    rankings = np.asarray(dt.import2dArray(rankings_fn))
    names = dt.import1dArray(names)
    trimmed_rankings = []
    for r in range(len(rankings)):
        trimmed = rankings[r].take(available_indexes)
        trimmed_rankings.append(trimmed)
    for a in range(len(trimmed_rankings)):
        print("Writing", names[a])
        dt.write1dArray(trimmed_rankings[a], folder_name + "class-" + names[a])
    print("Writing", rankings_fn[-6:])
    dt.write2dArray(trimmed_rankings, folder_name + "class-" + rankings_fn[-6:])
Example #22
def getNDCG(rankings_fn,
            fn,
            data_type,
            bow_fn,
            ppmi_fn,
            lowest_count,
            rewrite_files=False,
            highest_count=0,
            classification=""):

    # Check if the NDCG scores have already been calculated, if they have then skip.
    ndcg_fn = "../data/" + data_type + "/ndcg/" + fn + ".txt"

    all_fns = [ndcg_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", getNDCG.__name__)
        return
    else:
        print("Running task", getNDCG.__name__)

    # Get the file names for the PPMI values for every word and a list of words ("names")
    names = dt.import1dArray("../data/" + data_type + "/bow/names/" + bow_fn)
    ppmi = dt.import2dArray("../data/" + data_type + "/bow/ppmi/" + ppmi_fn)
    # Process the rankings and the PPMI line-by-line so as to not run out of memory
    ndcg_a = []
    #spearman_a = []
    with open(rankings_fn) as rankings:
        r = 0
        for lr in rankings:
            for lp in ppmi:
                # Get the plain-number ranking of the rankings, e.g. "1, 4, 3, 50"
                sorted_indices = np.argsort(
                    list(map(float,
                             lr.strip().split())))[::-1]
                # Convert PPMI scores to floats
                # Get the NDCG score for the PPMI score, which is a valuation, compared to the indice of the rank
                ndcg = ndcg_from_ranking(lp, sorted_indices)

                # Add to array and print
                ndcg_a.append(ndcg)
                print("ndcg", ndcg, names[r], r)
                """
                    smr = spearmanr(ppmi_indices, sorted_indices)[1]
                    spearman_a.append(smr)
                    print("spearman", smr, names[r], r)
                    """
                r += 1
                break
    # Save NDCG
    dt.write1dArray(ndcg_a, ndcg_fn)
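ndcg_from_ranking comes from elsewhere in the project; a minimal sketch of NDCG computed from a relevance vector and a ranked list of indices, using standard log2 discounting (whether this matches the project's exact variant is an assumption):
import numpy as np

def ndcg_from_ranking_sketch(relevance, ranked_indices):
    # relevance: score per item (here, PPMI values); ranked_indices: item indices, best first
    relevance = np.asarray(relevance, dtype=float)
    ranked_indices = np.asarray(ranked_indices)
    discounts = np.log2(np.arange(2, len(ranked_indices) + 2))
    dcg = np.sum(relevance[ranked_indices] / discounts)
    ideal = np.sort(relevance)[::-1][:len(ranked_indices)]
    idcg = np.sum(ideal / discounts)
    return dcg / idcg if idcg > 0 else 0.0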
Example #23
def makeTopVectorsDirections(filename):
    vectors = dt.import2dArray("Directions/" + filename + "Cut.directions")
    top250names = dt.import1dArray("filmdata/top250.txt")
    filmnames = dt.import1dArray("filmdata/filmNames.txt")

    top250vectors = []

    for f in range(len(filmnames)):
        for t in range(len(top250names)):
            if filmnames[f] == top250names[t]:
                top250vectors.append(vectors[t])

    dt.write2dArray(top250vectors,
                    "../data/movies/plot/t250" + filename + ".directions")
Example #24
 def denoisingAutoencoder(self, noise, deep_size):
     entity_vectors = np.asarray(dt.import2dArray(self.vector_path))
     if len(entity_vectors) != 15000:
         entity_vectors = entity_vectors.transpose()
     if self.class_path is None:
         entity_classes = entity_vectors
     else:
         entity_classes = np.asarray(dt.import2dArray(self.class_path))
     input_size = len(entity_vectors[0])
     output_size = len(entity_classes[0])
     if self.dropout_noise is None:
         self.model.add(GaussianNoise(noise, input_shape=(input_size, )))
     else:
         self.model.add(
             Dropout(self.dropout_noise[0], input_shape=(input_size, )))
     if deep_size is not None:
         self.model.add(
             Dense(output_dim=deep_size,
                   input_dim=self.hidden_layer_size,
                   init=self.layer_init,
                   activation=self.hidden_activation,
                   W_regularizer=l2(self.reg),
                   activity_regularizer=activity_l2(self.activity_reg)))
     self.model.add(
         Dense(output_dim=self.hidden_layer_size,
               input_dim=input_size,
               init=self.layer_init,
               activation=self.hidden_activation,
               W_regularizer=l2(self.reg)))
     self.model.add(
         Dense(output_dim=output_size,
               init=self.layer_init,
               activation=self.output_activation,
               W_regularizer=l2(self.reg)))
     self.model.compile(loss=self.loss, optimizer=self.optimizer)
     return entity_vectors, entity_classes
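The layer stack above uses the old Keras 1 keyword arguments (output_dim, init, W_regularizer, activity_l2); a sketch of the same denoising-autoencoder structure in the Keras 2 API, with sizes and defaults assumed:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GaussianNoise, Dropout
from tensorflow.keras.regularizers import l2

def denoising_autoencoder_sketch(input_size, hidden_size, output_size,
                                 noise=0.2, dropout_noise=None, reg=0.0,
                                 hidden_activation="tanh", output_activation="linear"):
    model = Sequential()
    # Corrupt the input either with Gaussian noise or with dropout
    if dropout_noise is None:
        model.add(GaussianNoise(noise, input_shape=(input_size,)))
    else:
        model.add(Dropout(dropout_noise, input_shape=(input_size,)))
    model.add(Dense(hidden_size, activation=hidden_activation, kernel_regularizer=l2(reg)))
    model.add(Dense(output_size, activation=output_activation, kernel_regularizer=l2(reg)))
    model.compile(loss="mse", optimizer="adam")
    return model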
Example #25
def main(data_type, clf, highest_amt, lowest_amt, depth, rewrite_files):

    min = lowest_amt
    max = highest_amt
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dmround"
    mds_fn = "../data/"+data_type+"/mds/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "d" + str(depth)
    svd_fn = "../data/"+data_type+"/svd/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    pca_fn = "../data/"+data_type+"/pca/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "round"

    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()
    if dt.fileExists(dm_fn) is False:
        newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False)
        newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False)


        vectors = np.concatenate((newsgroups_train.data, newsgroups_test.data), axis=0)
        newsgroups_test = None
        newsgroups_train = None
        # Get sparse tf rep
        tf_vectorizer = CountVectorizer(max_df=highest_amt, min_df=lowest_amt, stop_words='english')
        print("completed vectorizer")
        tf = tf_vectorizer.fit_transform(vectors)
        vectors = None
        # Get sparse PPMI rep from sparse tf rep
        print("done ppmisaprse")
        sparse_ppmi = convertPPMISparse(tf)
        # Get sparse Dsim matrix from sparse PPMI rep
        dm = getDissimilarityMatrixSparse(sparse_ppmi)
        dt.write2dArray(dm, dm_fn)
    else:
        dm = dt.import2dArray(dm_fn)
    print("starting mds")
    # Use as input to mds
    mds = createMDS(dm, depth)
    # save MDS
    dt.write2dArray(mds, mds_fn)
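createMDS is a project helper; a sketch of an equivalent scikit-learn call on the precomputed dissimilarity matrix (assuming metric MDS is the intent):
import numpy as np
from sklearn.manifold import MDS

def create_mds_sketch(dm, depth):
    # dm: square precomputed dissimilarity matrix; depth: target dimensionality
    mds = MDS(n_components=depth, dissimilarity="precomputed", random_state=0)
    return mds.fit_transform(np.asarray(dm))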
Example #26
def getDissimilarityMatrixSparse(tf):
    tflen = tf.shape[0]
    dm = np.empty([tflen, tflen], dtype="float64")
    pithing = 2/pi
    norms = np.empty(tflen, dtype="float64")

    #Calculate norms
    for ei in range(tflen):
        norms[ei] = spl.norm(tf[ei])
        print("norm", ei)

    dot_product = np.zeros([tflen, tflen], dtype="float64")

    use_old_dp = True
    if use_old_dp:
        dot_product = dt.import2dArray("dotproduct.temp")
    else:
        #Calculate dot products
        for ei in range(tflen):
            for ej in range(tflen):
                if dot_product[ej][ei] != 0:
                    dot_product[ei][ej] = dot_product[ej][ei]
                    continue
                dot_product[ei][ej] = tf[ei].dot(tf[ej].T)[0,0]
            print("dp", ei)
        dt.write2dArray(dot_product, "dotproduct.temp")

    norm_multiplied = np.empty([tflen, tflen], dtype="float64")

    # Calculate pairwise products of the norms
    for ei in range(tflen):
        for ej in range(tflen):
            norm_multiplied[ei][ej] = norms[ei] * norms[ej]
        print("norm product", ei)

    norm_multiplied = dt.shortenFloatsNoFn(norm_multiplied)
    dot_product = dt.shortenFloatsNoFn(dot_product)

    #Get angular differences
    for ei in range(tflen):
        for ej in range(tflen):
            ang = pithing * np.arccos(dot_product[ei][ej] / norm_multiplied[ei][ej])
            dm[ei][ej] = ang
        print(ei)
    return dm
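The REMINDER in Example #8 that np.dot is much faster applies here too; a vectorized dense sketch of the same (2/pi)*arccos(cosine) dissimilarity, which avoids the three elementwise loops:
import numpy as np

def angular_dissimilarity_dense_sketch(X):
    # X: dense (n_docs, n_terms) array of PPMI (or frequency) values
    X = np.asarray(X, dtype=np.float64)
    norms = np.linalg.norm(X, axis=1)
    cos = (X @ X.T) / np.outer(norms, norms)
    cos = np.clip(cos, -1.0, 1.0)        # guard against rounding just outside arccos' domain
    return (2.0 / np.pi) * np.arccos(cos)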
Example #27
def main(min, max, data_type, raw_fn, extension, cut_first_line, additional_name, make_individual, entity_name_fn,
         use_all_files, sparse_matrix, word_count_amt, classification):

    getVectors(raw_fn, entity_name_fn, extension, "../data/"+data_type+"/bow/",
           min, max, cut_first_line, get_all, additional_name,  make_individual, classification, use_all_files, 1000, data_type,
               sparse_matrix)

    bow = sp.csr_matrix(dt.import2dArray("../data/"+data_type+"/bow/frequency/phrases/class-all-"+str(min)+"-" + str(max)+"-"+classification))
    dt.write2dArray(convertPPMI( bow), "../data/"+data_type+"/bow/ppmi/class-all-"+str(min)+"-"+str(max)+"-" + classification)

    print("indiviual from all")
    printIndividualFromAll(data_type, "ppmi", min, max,  classification)

    printIndividualFromAll(data_type, "binary/phrases", min, max,  classification)

    convertToTfIDF(data_type, min, max, "../data/"+data_type+"/bow/frequency/phrases/class-all-"+str(min)+"-"+str(max)+"-"+classification, classification)

    printIndividualFromAll(data_type, "tfidf", min, max,  classification)
Example #28
def getCutOff(cluster_dict_fn, rankings_fn, file_name):

    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)

    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])

    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            max_score = 0
            cutoff = 0
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                y_test = dt.import2dArray(
                    "../data/movies/bow/frequency/phrases/class-" + v, "s")
                score = cohen_kappa_score(y_test, y_pred)
                print(v, int(i), "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter += 1
    dt.write2dArray(cutoff_clusters,
                    "../data/movies/rules/cutoff/" + file_name + ".txt")
Example #29
    def __init__(self,
                 vector_path,
                 class_path,
                 property_names_fn,
                 file_name,
                 svm_type,
                 training_size=10000,
                 lowest_count=200,
                 highest_count=21470000,
                 get_kappa=True,
                 get_f1=True,
                 single_class=True,
                 data_type="movies",
                 getting_directions=True,
                 threads=1,
                 chunk_amt=0,
                 chunk_id=0,
                 rewrite_files=False,
                 classification="all",
                 loc="../data/"):

        self.get_kappa = get_kappa
        self.get_f1 = get_f1
        self.data_type = data_type
        self.classification = classification
        self.lowest_amt = lowest_count
        self.higher_amt = highest_count

        if chunk_amt > 0:
            file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(
                chunk_amt)

        directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
        ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
        kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
        acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"

        all_fns = [directions_fn, kappa_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "getSVMResults")
            return
        else:
            print("Running task", "getSVMResults")

        y_train = 0
        y_test = 0
        vectors = np.asarray(dt.import2dArray(vector_path))
        print("imported vectors")
        if not getting_directions:
            classes = np.asarray(dt.import2dArray(class_path))
            print("imported classes")
        property_names = dt.import1dArray(property_names_fn)
        print("imported propery names")
        if chunk_amt > 0:
            if chunk_id == chunk_amt - 1:
                chunk = int(len(property_names) / chunk_amt)
                multiply = chunk_amt - 1
                property_names = property_names[chunk * multiply:]
            else:
                property_names = dt.chunks(
                    property_names, int(
                        (len(property_names) / chunk_amt)))[chunk_id]

        if not getting_directions:
            x_train, x_test, y_train, y_test = train_test_split(vectors,
                                                                classes,
                                                                test_size=0.3,
                                                                random_state=0)
        else:
            x_train = vectors
            x_test = vectors

        if get_f1:
            y_train = y_train.transpose()
            y_test = y_test.transpose()
            print("transpoosed")
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        if self.get_f1 is False:
            print("running svms")
            kappa_scores, directions, ktau_scores, property_names = self.runAllSVMs(
                y_test, y_train, property_names, file_name, svm_type,
                getting_directions, threads)

            dt.write1dArray(kappa_scores, kappa_fn)
            dt.write2dArray(directions, directions_fn)
            dt.write1dArray(ktau_scores, ktau_scores_fn)
            dt.write1dArray(property_names,
                            property_names_fn + file_name + ".txt")
        else:
            final_f1 = []
            final_acc = []
            for y in range(len(y_train)):
                f1, acc = self.runClassifySVM(y_test[y], y_train[y])
                print(f1, acc)
                final_f1.append(f1)
                final_acc.append(acc)
            dt.write1dArray(final_f1, ktau_scores_fn)
            dt.write1dArray(final_acc, acc_fn)
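runAllSVMs is implemented elsewhere in the class; a minimal sketch of learning one direction per property with a linear SVM, taking the hyperplane normal as the direction and kappa against the training labels as its score (an assumption about how the project scores directions):
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import cohen_kappa_score

def learn_direction_sketch(vectors, property_labels):
    # vectors: (n_entities, dim) space; property_labels: 0/1 per entity for one property
    clf = LinearSVC(C=1.0)
    clf.fit(vectors, property_labels)
    direction = clf.coef_[0]                               # hyperplane normal
    kappa = cohen_kappa_score(property_labels, clf.predict(vectors))
    return direction, kappa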
Example #30
    def __init__(self,
                 features_fn,
                 classes_fn,
                 class_names_fn,
                 cluster_names_fn,
                 filename,
                 max_depth=None,
                 balance=None,
                 criterion="entropy",
                 save_details=False,
                 data_type="movies",
                 cv_splits=5,
                 csv_fn="../data/temp/no_csv_provided.csv",
                 rewrite_files=True,
                 split_to_use=-1,
                 development=False,
                 limit_entities=False,
                 limited_label_fn=None,
                 vector_names_fn=None,
                 pruning=1,
                 save_results_so_far=False):

        vectors = np.asarray(dt.import2dArray(features_fn)).transpose()

        labels = np.asarray(dt.import2dArray(classes_fn, "i"))

        print("vectors", len(vectors), len(vectors[0]))
        print("labels", len(labels), len(labels[0]))
        print("vectors", len(vectors), len(vectors[0]))
        cluster_names = dt.import1dArray(cluster_names_fn)
        label_names = dt.import1dArray(class_names_fn)
        all_fns = []
        file_names = ['ACC J48' + filename, 'F1 J48' + filename]
        acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            0] + '.scores'
        f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            1] + '.scores'
        all_fns.append(acc_fn)
        all_fns.append(f1_fn)
        all_fns.append(csv_fn)

        print(dt.allFnsAlreadyExist(all_fns), rewrite_files)

        if (dt.allFnsAlreadyExist(all_fns) and not rewrite_files) or save_results_so_far:
            print("Skipping task", "Weka Tree")
            return
        else:
            print("Running task", "Weka Tree")

        for l in range(len(cluster_names)):
            cluster_names[l] = cluster_names[l].split()[0]
        """
        for l in range(len(label_names)):
            if label_names[l][:6] == "class-":
                label_names[l] = label_names[l][6:]
        """
        f1_array = []
        accuracy_array = []

        labels = labels.transpose()
        print("labels transposed")
        print("labels", len(labels), len(labels[0]))

        if limit_entities is False:
            vector_names = dt.import1dArray(vector_names_fn)
            limited_labels = dt.import1dArray(limited_label_fn)
            vectors = np.asarray(
                dt.match_entities(vectors, limited_labels, vector_names))

        all_y_test = []
        all_predictions = []
        for l in range(len(labels)):

            if balance:
                new_vectors, new_labels = dt.balanceClasses(vectors, labels[l])
            else:
                new_vectors = vectors
                new_labels = labels[l]
            # Select training data with cross validation

            ac_y_test = []
            ac_y_train = []
            ac_x_train = []
            ac_x_test = []
            ac_y_dev = []
            ac_x_dev = []
            cv_f1 = []
            cv_acc = []
            if cv_splits == 1:
                kf = KFold(n_splits=3, shuffle=False, random_state=None)
            else:
                kf = KFold(n_splits=cv_splits,
                           shuffle=False,
                           random_state=None)
            c = 0
            for train, test in kf.split(new_vectors):
                if split_to_use > -1:
                    if c != split_to_use:
                        c += 1
                        continue
                ac_y_test.append(new_labels[test])
                ac_y_train.append(new_labels[train[int(len(train) * 0.2):]])
                val = int(len(train) * 0.2)
                t_val = train[val:]
                nv_t_val = new_vectors[t_val]
                ac_x_train.append(nv_t_val)
                ac_x_test.append(new_vectors[test])
                ac_x_dev.append(new_vectors[train[:int(len(train) * 0.2)]])
                ac_y_dev.append(new_labels[train[:int(len(train) * 0.2)]])
                c += 1
                if cv_splits == 1:
                    break

            predictions = []
            rules = []

            if development:
                ac_x_test = np.copy(np.asarray(ac_x_dev))
                ac_y_test = np.copy(np.asarray(ac_y_dev))

            train_fn = "../data/" + data_type + "/weka/data/" + filename + "Train.txt"
            test_fn = "../data/" + data_type + "/weka/data/" + filename + "Test.txt"

            for splits in range(len(ac_y_test)):

                # Get the weka predictions
                dt.writeArff(ac_x_train[splits], [ac_y_train[splits]],
                             [label_names[splits]],
                             train_fn,
                             header=True)
                dt.writeArff(ac_x_test[splits], [ac_y_test[splits]],
                             [label_names[splits]],
                             test_fn,
                             header=True)
                prediction, rule = self.getWekaPredictions(
                    train_fn + label_names[splits] + ".arff",
                    test_fn + label_names[splits] + ".arff", save_details,
                    pruning)
                predictions.append(prediction)
                rules.append(rule)

            for i in range(len(predictions)):
                if len(predictions) == 1:
                    all_y_test.append(ac_y_test[i])
                    all_predictions.append(predictions[i])
                f1 = f1_score(ac_y_test[i], predictions[i], average="binary")
                accuracy = accuracy_score(ac_y_test[i], predictions[i])
                cv_f1.append(f1)
                cv_acc.append(accuracy)
                scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
                print(scores)

                # Export a tree for each label predicted by the clf, not sure if this is needed...
                if save_details:
                    data_fn = "../data/" + data_type + "/rules/weka_rules/" + label_names[
                        l] + " " + filename + ".txt"
                    class_names = [label_names[l], "NOT " + label_names[l]]
                    #self.get_code(clf, cluster_names, class_names, label_names[l] + " " + filename, data_type)
                    dt.write1dArray(rules[i].split("\n"), data_fn)
                    dot_file = dt.import1dArray(data_fn)
                    new_dot_file = []
                    for line in dot_file:
                        if "->" not in line and "label" in line and '"t ' not in line and '"f ' not in line:
                            line = line.split('"')
                            line[1] = '"' + cluster_names[int(line[1])] + '"'
                            line = "".join(line)
                        new_dot_file.append(line)
                    dt.write1dArray(new_dot_file, data_fn)
                    graph = pydot.graph_from_dot_file(data_fn)
                    graph.write_png("../data/" + data_type +
                                    "/rules/weka_images/" + label_names[l] +
                                    " " + filename + ".png")
            f1_array.append(np.average(np.asarray(cv_f1)))
            accuracy_array.append(np.average(np.asarray(cv_acc)))

        accuracy_array = np.asarray(accuracy_array)
        accuracy_average = np.average(accuracy_array)
        accuracy_array = accuracy_array.tolist()
        f1_array = np.asarray(f1_array)
        f1_average = np.average(f1_array)
        f1_array = f1_array.tolist()
        micro_average = f1_score(np.asarray(all_y_test),
                                 np.asarray(all_predictions),
                                 average="micro")

        print("Micro F1", micro_average)

        accuracy_array.append(accuracy_average)
        accuracy_array.append(0.0)

        f1_array.append(f1_average)
        f1_array.append(micro_average)

        scores = [accuracy_array, f1_array]

        dt.write1dArray(accuracy_array, acc_fn)
        dt.write1dArray(f1_array, f1_fn)

        print(csv_fn)
        if dt.fileExists(csv_fn):
            print("File exists, writing to csv")
            try:
                dt.write_to_csv(csv_fn, file_names, scores)
            except PermissionError:
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                dt.write_to_csv(
                    csv_fn[:len(csv_fn) - 4] + str(random.random()) +
                    "FAIL.csv", file_names, scores)
        else:
            print("File does not exist, recreating csv")
            key = []
            for l in label_names:
                key.append(l)
            key.append("AVERAGE")
            key.append("MICRO AVERAGE")
            dt.write_csv(csv_fn, file_names, scores, key)