Example #1
def LDA(tf, names, components, file_name, doc_topic_prior, topic_word_prior,
        data_type, rewrite_files):
    # Removed model name as it was unused and I manually renamed a bunch of files and was too lazy to do model too
    rep_name = "../data/" + data_type + "/LDA/rep/" + file_name + ".txt"
    model_name = "../data/" + data_type + "/LDA/model/" + file_name + ".txt"
    names_name = "../data/" + data_type + "/LDA/names/" + file_name + ".txt"

    all_names = [rep_name, names_name]

    if dt.allFnsAlreadyExist(all_names) and not rewrite_files:
        print("Already completed")
        return
    print(len(tf), len(tf[0]))

    print("Fitting LDA model with tf features")
    lda = LatentDirichletAllocation(doc_topic_prior=doc_topic_prior,
                                    topic_word_prior=topic_word_prior,
                                    n_components=components)  # was n_topics in older scikit-learn
    t0 = time()
    tf = np.asarray(tf).transpose()
    new_rep = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    topics = print_top_words(lda, names)
    topics.reverse()
    dt.write1dArray(topics, names_name)
    dt.write2dArray(new_rep.transpose(), rep_name)
    joblib.dump(lda, model_name)
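A minimal, self-contained sketch of the same fit/transform step, using a small made-up document-term matrix and hypothetical priors (note the function above receives tf as terms x documents and transposes it before fitting):

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

tf = np.random.randint(0, 5, size=(20, 100))   # hypothetical doc-term counts
lda = LatentDirichletAllocation(n_components=10,
                                doc_topic_prior=0.1,
                                topic_word_prior=0.01)
doc_topics = lda.fit_transform(tf)             # shape (20 docs, 10 topics)
print(doc_topics.shape)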
Example #2
def match_entities(entity_fn, t_entity_fn, entities_fn, classification):
    names = dt.import1dArray(entity_fn)
    t_names = dt.import1dArray(t_entity_fn)
    entities = dt.import2dArray(entities_fn)
    indexes_to_delete = []
    amount_found = 0
    for n in range(len(names)):
        names[n] = dt.removeEverythingFromString(names[n])
    for n in range(len(t_names)):
        t_names[n] = dt.removeEverythingFromString(t_names[n])
    matched_ids = []
    for n in range(len(t_names)):
        for ni in range(len(names)):
            matched_name = t_names[n]
            all_name = names[ni]
            if matched_name == all_name:
                print(matched_name)
                matched_ids.append(ni)
                break
    matched_entities = []
    for e in matched_ids:
        matched_entities.append(entities[e])

    print("Amount found", amount_found)
    dt.write2dArray(matched_entities, entities_fn[:len(entities_fn)-4] + "-" + classification + ".txt")
def main(data_type, clf, min, max, depth, rewrite_files):
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dmround"
    mds_fn = "../data/"+data_type+"/mds/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "d" + str(depth)
    svd_fn = "../data/"+data_type+"/svd/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    pca_fn = "../data/"+data_type+"/pca/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "round"

    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/simple_numeric_stopwords_ppmi 2-all.npz"
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]) and not rewrite_files:
        print("all files exist")
        exit()

    #Get MDS
    """
    tf = dt.import2dArray(term_frequency_fn).transpose()
    pca = sparseSVD(tf, depth)
    dt.write2dArray(pca, pca_fn)
    """

    # REMINDER: np.dot is WAY faster!
    tf = dt.import2dArray(term_frequency_fn, return_sparse=True)

    dm = getDsimMatrixDense(tf)
    dt.write2dArray(dm, dm_fn)
    print("wrote dm")
    """ Pretty sure none of this works
Example #4
def pavTermFrequency(ranking_fn, cluster_names_fn, fn, do_p):
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        frq.append(readFreq(name))

    pav_classes = []

    for f in range(len(frq)):
        print(names[f])
        x = np.asarray(frq[f])
        y = ranking[f]

        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)

    dt.write2dArray(
        pav_classes,
        "../data/movies/finetune/" + fn + "PavTermFrequency.txt")
    return pav_classes
def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # NOTE: the word-vector scoring below was never finished; the loop is a
        # stub and cutoff_words is never populated
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    clusters[cl][wa][w]

    dt.write2dArray(cutoff_words,
                    "../data/movies/rules/cutoff/" + file_name + "WVN.txt")
Example #6
def logisticRegression(cluster_names_fn,
                       ranking_fn,
                       file_name,
                       do_p=False,
                       data_type="movies",
                       rewrite_files=False,
                       limit_entities=False,
                       classification="genres",
                       lowest_amt=0,
                       highest_amt=2147000000,
                       sparse_freqs_fn=None,
                       bow_names_fn=None):
    lr_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [lr_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClusters.__name__)
        return
    else:
        print("Running task", bagOfClusters.__name__)

    if limit_entities is False:
        classification = "all"

    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)

    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt,
                     classification, file_name, bow_names, sparse_freqs)

    dt.write2dArray(frq, lr_fn)
    return frq
Example #7
def getLROnBag(cluster_dict, data_type, lowest_amt, highest_amt,
               classification, file_name, names, sparse_freqs):
    bag_of_clusters = []
    # Note: previously the PPMI values were used directly here
    for c in range(len(cluster_dict)):
        # Remove the colons
        for f in range(len(cluster_dict[c])):
            if ":" in cluster_dict[c][f]:
                cluster_dict[c][f] = cluster_dict[c][f][:-1]
        # Add all of the frequencies together to make a bag-of-clusters
        name = cluster_dict[c][0]
        word_array = sparse_freqs[np.where(names == name)].toarray()
        accum_freqs = np.zeros(shape=len(word_array), dtype=np.int64)
        # For all the cluster terms
        for name in cluster_dict[c]:
            if ":" in name:
                name = name[:-1]
            # Import the class
            class_to_add = sparse_freqs[np.where(names == name)].toarray()
            # Add the current class frequencies to the accumulated total
            accum_freqs = np.add(accum_freqs, class_to_add)
        # Append this cluster's frequencies to the bag-of-clusters
        bag_of_clusters.append(accum_freqs)
    # Convert to binary
    for c in range(len(bag_of_clusters)):
        bag_of_clusters[c][bag_of_clusters[c] > 1] = 1
        bag_of_clusters[c] = bag_of_clusters[c][
            0]  # sparse .toarray() returns a (1, n) row matrix, so take row 0
    dt.write2dArray(bag_of_clusters,
                    "../data/" + data_type + "/bow/boc/" + file_name + ".txt")
    return bag_of_clusters
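An illustrative sketch of the accumulate-and-binarize step above, on a tiny made-up sparse term-frequency matrix (terms x entities); the term names and counts are hypothetical:

import numpy as np
import scipy.sparse as sp

names = np.array(["funny", "hilarious", "comedy"])
sparse_freqs = sp.csr_matrix(np.array([[0, 2, 1],
                                       [1, 0, 0],
                                       [3, 0, 2]]))
cluster = ["funny", "comedy"]
rows = [np.where(names == n)[0][0] for n in cluster]
accum_freqs = np.asarray(sparse_freqs[rows].sum(axis=0)).ravel()  # summed counts per entity
binary = (accum_freqs > 0).astype(int)                            # presence of any cluster term
print(accum_freqs, binary)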
Example #8
def bagOfClusters(cluster_names_fn,
                  ranking_fn,
                  file_name,
                  do_p=False,
                  data_type="movies",
                  rewrite_files=False,
                  limit_entities=False,
                  classification="genres",
                  lowest_amt=0,
                  highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClusters.__name__)
        return
    else:
        print("Running task", bagOfClusters.__name__)

    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import2dArray(cluster_names_fn, "s")

    frq = writeBagOfClusters(names, data_type, lowest_amt, highest_amt,
                             classification)

    dt.write2dArray(frq, pavPPMI_fn)
    return frq
Example #9
def makeTopVectors(filename):

    vectors = dt.import2dArray("Rankings/" + filename + ".space")
    top250names = dt.import1dArray("filmdata/top250.txt")
    film_names = dt.import1dArray("filmdata/filmNames.txt")

    indexes = []
    ordered_names = []
    for f in range(len(film_names)):
        for t in top250names:
            if film_names[f] == t:
                indexes.append(f)
                ordered_names.append(t)

    top_vectors = [[]]
    for v in range(len(vectors)):
        if v > 0:
            top_vectors.append([])
        for i in range(len(vectors[v])):
            for id in indexes:
                if i == id:
                    top_vectors[v].append(vectors[v][i])

    dt.write2dArray(top_vectors, "Plots/Top174" + filename + ".space")
    dt.write1dArray(ordered_names, "Plots/Top174OrderedByOriginalList.txt")
Example #10
def PPMIFT(cluster_names_fn,
           ranking_fn,
           file_name,
           do_p=False,
           data_type="movies",
           rewrite_files=False,
           limit_entities=False,
           classification="genres",
           lowest_amt=0,
           highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
    print("certainly still running that old pavPPMI task, yes sir")
    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(
            readPPMI(name, data_type, lowest_amt, highest_amt, classification))

    dt.write2dArray(frq, pavPPMI_fn)
    return frq
Example #11
def binaryClusterTerm(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = dt.import1dArray(
            "../data/movies/bow/binary/phrases/class-" + cn, "i")
        all_cluster_output.append(binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "ClusterTerm.txt")
Example #12
def PPMI(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = dt.import1dArray("../data/movies/bow/ppmi/class-class-" + cn,
                                  "f")
        all_cluster_output.append(binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "PPMI.txt")
def fixCutoffFormatting(cutoff_fn, file_name):
    cutoff = dt.import1dArray(cutoff_fn)
    for c in range(len(cutoff)):
        cutoff[c] = cutoff[c].split()
        for i in range(len(cutoff[c])):
            cutoff[c][i] = int(dt.stripPunctuation(cutoff[c][i]))
    dt.write2dArray(cutoff,
                    "../data/movies/rules/cutoff/" + file_name + ".txt")
Example #14
def convertToTfIDF(data_type, lowest_count, highest_count, freq_arrays_fn, class_type):
    freq = np.asarray(dt.import2dArray(freq_arrays_fn))
    v = TfidfTransformer()
    x = v.fit_transform(freq)
    x = x.toarray()
    dt.write2dArray(x, "../data/"+data_type+"/bow/tfidf/class-all-"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type))
    dt.writeClassAll("../data/"+data_type+"/bow/tfidf/class-all-"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type),
                     "../data/"+data_type+"/bow/names/"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type)+".txt",
                  "../data/"+data_type+"/bow/names/"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type)+".txt",
                     "../data/"+data_type+"/bow/tfidf/class-all-"+str(lowest_count)+"-"+str(highest_count)+"-"+str(class_type))
Example #15
def pavPPMI(cluster_names_fn,
            ranking_fn,
            file_name,
            do_p=False,
            data_type="movies",
            rewrite_files=False,
            limit_entities=False,
            classification="genres",
            lowest_amt=0,
            highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
    print("certainly still running that old pavPPMI task, yes sir")
    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(
            readPPMI(name, data_type, lowest_amt, highest_amt, classification))

    pav_classes = []

    for f in range(len(frq)):
        try:
            print(names[f])
            x = np.asarray(frq[f])
            y = ranking[f]

            ir = IsotonicRegression()
            y_ = ir.fit_transform(x, y)
            pav_classes.append(y_)
            if do_p:
                plot(x, y, y_)
        except ValueError:
            print(names[f], "len ppmi",
                  len(frq[f], "len ranking", len(ranking[f])))
            exit()
        print(f)

    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
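A toy illustration of the isotonic (PAV) fit that pavPPMI applies per cluster, with made-up PPMI and ranking values:

import numpy as np
from sklearn.isotonic import IsotonicRegression

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])   # hypothetical PPMI values
y = np.array([0.2, 0.1, 0.5, 0.4, 0.9])   # hypothetical ranking scores
ir = IsotonicRegression()
y_ = ir.fit_transform(x, y)               # non-decreasing fit: [0.15, 0.15, 0.45, 0.45, 0.9]
print(y_)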
def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # For each cluster
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(
                clusters[cl])
            explained_cutoff.append(cluster_explanation + ",")

            dict_index = 0
            for h in range(len(cluster_dict_arrays[cl])):
                if cluster_dict_arrays[cl][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[cl][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)
    dt.write2dArray(
        explanations,
        "../data/movies/rules/final_names/" + file_name + "WVN.txt")
    dt.write2dArray(explanation_cutoffs,
                    "../data/movies/rules/final_cutoff/" + file_name + ".txt")
def saveClusters(directions_fn,
                 scores_fn,
                 names_fn,
                 filename,
                 amt_of_dirs,
                 data_type,
                 cluster_amt,
                 rewrite_files=False,
                 algorithm="meanshift_k"):

    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    p_dir = dt.import2dArray(directions_fn)
    p_names = dt.import1dArray(names_fn, "s")
    p_scores = dt.import1dArray(scores_fn, "f")

    ids = np.argsort(p_scores)

    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    unique, counts = np.unique(labels, return_counts=True)

    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")

    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
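A sketch of the grouping step in saveClusters, with scikit-learn's KMeans standing in for the project's kMeans helper and randomly generated direction vectors:

import numpy as np
from sklearn.cluster import KMeans

p_dir = np.random.rand(50, 20)                        # hypothetical direction vectors
p_names = np.array(["dir%d" % i for i in range(50)])
labels = KMeans(n_clusters=5, n_init=10).fit_predict(p_dir)

clusters = [p_names[labels == l].tolist() for l in np.unique(labels)]
cluster_directions = [p_dir[labels == l].mean(axis=0) for l in np.unique(labels)]
print([len(c) for c in clusters])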
Example #18
def writeFromMultiClass(multi_class_fn, output_folder, entity_names_fn, data_type, classify_name):
    # Get the entities we have phrases for
    entity_names = dt.import1dArray(entity_names_fn)

    # Import multi classes
    multi_class = dt.import1dArray(multi_class_fn)
    class_names = []
    class_val = []
    highest_class = 0

    for line in multi_class:
        cn, cv = re.split(r'\t+', line)
        cv = int(cv)
        class_names.append(cn)
        class_val.append(cv)
        if cv  > highest_class:
            highest_class = cv



    matched_entity_names = list(set(entity_names).intersection(class_names))
    matched_entity_names.sort()
    dt.write1dArray(matched_entity_names, "../data/" + data_type + "/classify/"+classify_name+"/available_entities.txt")


    indexes_to_delete = []

    for n in range(len(class_names)):
        found = False
        for en in range(len(matched_entity_names)):
            if class_names[n] == matched_entity_names[en]:
                found=True
                break
        if found is False:
            indexes_to_delete.append(n)

    class_val = np.delete(class_val, indexes_to_delete)

    classes = []
    print("Found " + str(highest_class) + " classes")
    for e in range(len(matched_entity_names)):
        class_a = [0] * highest_class
        class_a[class_val[e]-1] = 1
        classes.append(class_a)
    dt.write2dArray(classes, "../data/"+data_type+"/classify/"+classify_name+"/class-all")
    print("Wrote class all")
    classes = np.asarray(classes).transpose()


    for cn in range(len(classes)):
        dt.write1dArray(classes[cn], "../data/"+data_type+"/classify/"+classify_name+"/class-"+str(cn))
        print("Wrote", "class-"+str(cn))
Example #19
def randomAll(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = np.asarray(
            dt.import1dArray(
                "../data/movies/bow/frequency/phrases/class-" + cn, "f"))
        random_binary = []
        for b in binary:
            random_binary.append(randint(0, np.amax(binary)))
        all_cluster_output.append(random_binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "RandomAll.txt")
Example #20
def trimRankings(rankings_fn, available_indexes_fn, names, folder_name):
    available_indexes = dt.import1dArray(available_indexes_fn)
    rankings = np.asarray(dt.import2dArray(rankings_fn))
    names = dt.import1dArray(names)
    trimmed_rankings = []
    for r in range(len(rankings)):
        trimmed = rankings[r].take(available_indexes)
        trimmed_rankings.append(trimmed)
    for a in range(len(trimmed_rankings)):
        print("Writing", names[a])
        dt.write1dArray(trimmed_rankings[a], folder_name + "class-" + names[a])
    print("Writing", rankings_fn[-6:])
    dt.write2dArray(trimmed_rankings, folder_name + "class-" + rankings_fn[-6:])
Example #21
def makeTopVectorsDirections(filename):
    vectors = dt.import2dArray("Directions/" + filename + "Cut.directions")
    top250names = dt.import1dArray("filmdata/top250.txt")
    filmnames = dt.import1dArray("filmdata/filmNames.txt")

    top250vectors = []

    for f in range(len(filmnames)):
        for t in range(len(top250names)):
            if filmnames[f] == top250names[t]:
                top250vectors.append(vectors[t])

    dt.write2dArray(top250vectors,
                    "../data/movies/plot/t250" + filename + ".directions")
Example #22
def avgPPMI(cluster_names_fn,
            ranking_fn,
            file_name,
            do_p=False,
            data_type="movies",
            rewrite_files=False,
            classification="genres",
            lowest_amt=0,
            highest_amt=2147000000,
            limit_entities=False,
            save_results_so_far=False):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if (dt.allFnsAlreadyExist(all_fns)
            and not rewrite_files) or save_results_so_far:
        print("Skipping task", avgPPMI.__name__)
        return
    else:
        print("Running task", avgPPMI.__name__)

    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import2dArray(cluster_names_fn, "s")

    for n in range(len(names)):
        for x in range(len(names[n])):
            if ":" in names[n][x]:
                names[n][x] = names[n][x][:-1]

    frq = []
    counter = 0

    for n in range(len(names)):
        name_frq = []
        for name in names[n]:
            name_frq.append(
                readPPMI(name, data_type, lowest_amt, highest_amt,
                         classification))
        avg_frq = []
        name_frq = np.asarray(name_frq).transpose()
    for entity_frqs in name_frq:
        avg_frq.append(np.average(entity_frqs))
        frq.append(np.asarray(avg_frq))
        print(n)

    dt.write2dArray(frq, pavPPMI_fn)
    return frq
Example #23
def binaryInCluster(cluster_dict_fn, fn):
    cluster = dt.readArrayDict(cluster_dict_fn)
    all_cluster_output = []
    for key, items in cluster.items():
        init_binary = dt.import1dArray(
            "../data/movies/bow/binary/phrases/" + key, "i")
        for i in items:
            binary = dt.import1dArray("../data/movies/bow/binary/phrases/" + i,
                                      "i")
            for j in range(len(init_binary)):
                if binary[j] == 1:
                    init_binary[j] = 1
        all_cluster_output.append(init_binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "InCluster.txt")
Example #24
def main(data_type, clf, highest_amt, lowest_amt, depth, rewrite_files):

    min = lowest_amt
    max = highest_amt
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dmround"
    mds_fn = "../data/"+data_type+"/mds/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "d" + str(depth)
    svd_fn = "../data/"+data_type+"/svd/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    pca_fn = "../data/"+data_type+"/pca/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "round"

    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]) and not rewrite_files:
        print("all files exist")
        exit()
    if dt.fileExists(dm_fn) is False:
        newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False)
        newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False)


        vectors = np.concatenate((newsgroups_train.data, newsgroups_test.data), axis=0)
        newsgroups_test = None
        newsgroups_train = None
        # Get sparse tf rep
        tf_vectorizer = CountVectorizer(max_df=highest_amt, min_df=lowest_amt, stop_words='english')
        print("completed vectorizer")
        tf = tf_vectorizer.fit_transform(vectors)
        vectors = None
        # Get sparse PPMI rep from sparse tf rep
        sparse_ppmi = convertPPMISparse(tf)
        print("done ppmi sparse")
        # Get sparse Dsim matrix from sparse PPMI rep
        dm = getDissimilarityMatrixSparse(sparse_ppmi)
        dt.write2dArray(dm, dm_fn)
    else:
        dm = dt.import2dArray(dm_fn)
    print("starting mds")
    # Use as input to mds
    mds = createMDS(dm, depth)
    # save MDS
    dt.write2dArray(mds, mds_fn)
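createMDS is a project helper; a sketch of what this step likely amounts to, using scikit-learn's MDS on a tiny made-up precomputed dissimilarity matrix:

import numpy as np
from sklearn.manifold import MDS

dm = np.array([[0.0, 0.3, 0.7],
               [0.3, 0.0, 0.5],
               [0.7, 0.5, 0.0]])
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0)
embedding = mds.fit_transform(dm)           # (3 entities, 2 dimensions)
print(embedding.shape)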
Example #25
def bagOfClustersPavPPMI(cluster_names_fn,
                         ranking_fn,
                         file_name,
                         do_p=False,
                         data_type="movies",
                         rewrite_files=False,
                         limit_entities=False,
                         classification="genres",
                         lowest_amt=0,
                         highest_amt=2147000000,
                         sparse_freqs_fn=None,
                         bow_names_fn=None):

    pavPPMI_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClustersPavPPMI.__name__)
        return
    else:
        print("Running task", bagOfClustersPavPPMI.__name__)

    if limit_entities is False:
        classification = "all"

    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)
    ranking = dt.import2dArray(ranking_fn)
    cluster_names = dt.import2dArray(cluster_names_fn, "s")

    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt,
                     classification, file_name, bow_names, sparse_freqs)

    pav_classes = []

    for f in range(len(frq)):
        print(cluster_names[f])
        x = np.asarray(frq[f])
        y = ranking[f]

        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)

    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
Example #26
def maxNonZero(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = np.asarray(
            dt.import1dArray(
                "../data/movies/bow/frequency/phrases/class-" + cn, "f"))
        random_binary = []
        for b in binary:
            if b > 0:
                random_binary.append(np.amax(binary))
            else:
                random_binary.append(0)
        all_cluster_output.append(random_binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "MaxNonZero.txt")
Example #27
def getDissimilarityMatrixSparse(tf):
    tflen = tf.shape[0]
    dm = np.empty([tflen, tflen], dtype="float64")
    pithing = 2/pi  # scales arccos output into [0, 1] (normalized angular distance)
    norms = np.empty(tflen, dtype="float64")

    #Calculate norms
    for ei in range(tflen):
        norms[ei] = spl.norm(tf[ei])
        print("norm", ei)

    dot_product = np.zeros([tflen, tflen], dtype="float64")

    use_old_dp = True
    if use_old_dp:
        # Reload a previously cached dot-product matrix from disk
        dot_product = dt.import2dArray("dotproduct.temp")
    else:
        #Calculate dot products
        for ei in range(tflen):
            for ej in range(tflen):
                if dot_product[ej][ei] != 0:
                    dot_product[ei][ej] = dot_product[ej][ei]
                    continue
                dot_product[ei][ej] = tf[ei].dot(tf[ej].T)[0,0]
            print("dp", ei)
        dt.write2dArray(dot_product, "dotproduct.temp")

    norm_multiplied = np.empty([tflen, tflen], dtype="float64")

    # Calculate pairwise norm products
    for ei in range(tflen):
        for ej in range(tflen):
            norm_multiplied[ei][ej] = norms[ei] * norms[ej]
        print("norm product", ei)

    norm_multiplied = dt.shortenFloatsNoFn(norm_multiplied)
    dot_product = dt.shortenFloatsNoFn(dot_product)

    #Get angular differences
    for ei in range(tflen):
        for ej in range(tflen):
            ang = pithing * np.arccos(dot_product[ei][ej] / norm_multiplied[ei][ej])
            dm[ei][ej] = ang
        print(ei)
    return dm
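In line with the "REMINDER: np.dot is WAY faster" note in the earlier main(), a vectorized sketch of the same normalized angular dissimilarity for a small dense matrix (rows = entities); the input here is random:

import numpy as np
from math import pi

tf = np.random.rand(10, 50)                      # hypothetical dense rows
norms = np.linalg.norm(tf, axis=1)
cos = np.dot(tf, tf.T) / np.outer(norms, norms)  # cosine similarities
cos = np.clip(cos, -1.0, 1.0)                    # guard against rounding drift
dm = (2 / pi) * np.arccos(cos)                   # normalized angular dissimilarity
print(dm.shape)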
Example #28
def main(min, max, data_type, raw_fn, extension, cut_first_line, additional_name, make_individual, entity_name_fn,
         use_all_files, sparse_matrix, word_count_amt, classification):

    getVectors(raw_fn, entity_name_fn, extension, "../data/"+data_type+"/bow/",
           min, max, cut_first_line, get_all, additional_name,  make_individual, classification, use_all_files, 1000, data_type,
               sparse_matrix)

    bow = sp.csr_matrix(dt.import2dArray("../data/"+data_type+"/bow/frequency/phrases/class-all-"+str(min)+"-" + str(max)+"-"+classification))
    dt.write2dArray(convertPPMI( bow), "../data/"+data_type+"/bow/ppmi/class-all-"+str(min)+"-"+str(max)+"-" + classification)

    print("indiviual from all")
    printIndividualFromAll(data_type, "ppmi", min, max,  classification)

    printIndividualFromAll(data_type, "binary/phrases", min, max,  classification)

    convertToTfIDF(data_type, min, max, "../data/"+data_type+"/bow/frequency/phrases/class-all-"+str(min)+"-"+str(max)+"-"+classification, classification)

    printIndividualFromAll(data_type, "tfidf", min, max,  classification)
Example #29
def getAllPhraseRankings(directions_fn=None,
                         vectors_fn=None,
                         property_names_fn=None,
                         vector_names_fn=None,
                         fn="no filename",
                         percentage_increment=1,
                         scores_fn=None,
                         top_amt=0,
                         discrete=False,
                         data_type="movies",
                         rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"

    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names,
                                               scores)[:top_amt]

    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    #dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")

    dt.write2dArray(rankings, rankings_fn_all)
def getCutOff(cluster_dict_fn, rankings_fn, file_name):

    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)

    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])

    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            max_score = 0
            cutoff = 0
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                y_test = dt.import2dArray(
                    "../data/movies/bow/frequency/phrases/class-" + v, "s")
                score = cohen_kappa_score(y_test, y_pred)
                print(v, int(i), "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter += 1
    dt.write2dArray(cutoff_clusters,
                    "../data/movies/rules/cutoff/" + file_name + ".txt")