Example #1
def LDA(tf, names, components, file_name, doc_topic_prior, topic_word_prior,
        data_type, rewrite_files):
    # Removed model name as it was unused and I manually renamed a bunch of files and was too lazy to do model too
    rep_name = "../data/" + data_type + "/LDA/rep/" + file_name + ".txt"
    model_name = "../data/" + data_type + "/LDA/model/" + file_name + ".txt"
    names_name = "../data/" + data_type + "/LDA/names/" + file_name + ".txt"

    all_names = [rep_name, names_name]

    if dt.allFnsAlreadyExist(all_names) and not rewrite_files:
        print("Already completed")
        return
    print(len(tf), len(tf[0]))

    print("Fitting LDA models with tf features,")
    lda = LatentDirichletAllocation(doc_topic_prior=doc_topic_prior,
                                    topic_word_prior=topic_word_prior,
                                    n_topics=components)  # renamed to n_components in scikit-learn >= 0.19
    t0 = time()
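    # LatentDirichletAllocation expects one document per row (n_samples, n_features), hence the transpose below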
    tf = np.asarray(tf).transpose()
    new_rep = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    topics = print_top_words(lda, names)
    topics.reverse()
    dt.write1dArray(topics, names_name)
    dt.write2dArray(new_rep.transpose(), rep_name)
    joblib.dump(lda, model_name)
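Every example on this page relies on a project-local helper module dt (and, in Example #1, a print_top_words helper) whose implementation is not shown. Purely as a rough guide, here is a minimal sketch of what the core dt I/O helpers appear to do, inferred from how they are called in these snippets; the parameter names, defaults, and type-flag handling below are assumptions, and the real module almost certainly does more (sparse formats, encodings, etc.).

import os

def write1dArray(array, name):
    # One item per line, as plain text (assumed behaviour)
    with open(name, "w") as f:
        for item in array:
            f.write(str(item) + "\n")

def write2dArray(array, name):
    # One row per line, values separated by spaces (assumed behaviour)
    with open(name, "w") as f:
        for row in array:
            f.write(" ".join(str(v) for v in row) + "\n")

def import1dArray(file_name, file_type="s"):
    # "i" -> ints, "f" -> floats, anything else -> raw strings (assumed flags)
    with open(file_name, "r") as f:
        lines = [line.rstrip("\n") for line in f]
    if file_type == "i":
        return [int(line) for line in lines]
    if file_type == "f":
        return [float(line) for line in lines]
    return lines

def import2dArray(file_name, file_type="f"):
    # One row per line, whitespace-separated values (assumed layout)
    with open(file_name, "r") as f:
        rows = [line.split() for line in f]
    if file_type == "i":
        return [[int(v) for v in row] for row in rows]
    return [[float(v) for v in row] for row in rows]

def allFnsAlreadyExist(file_names):
    # True only if every listed output file already exists on disk
    return all(os.path.exists(fn) for fn in file_names)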
Example #2
def makeTopVectors(filename):

    vectors = dt.import2dArray("Rankings/" + filename + ".space")
    top250names = dt.import1dArray("filmdata/top250.txt")
    film_names = dt.import1dArray("filmdata/filmNames.txt")

    indexes = []
    ordered_names = []
    for f in range(len(film_names)):
        for t in top250names:
            if film_names[f] == t:
                indexes.append(f)
                ordered_names.append(t)

    top_vectors = []
    for v in range(len(vectors)):
        # Keep only the entries whose index matches one of the top-250 films
        top_vectors.append([vectors[v][i] for i in indexes])

    dt.write2dArray(top_vectors, "Plots/Top174" + filename + ".space")
    dt.write1dArray(ordered_names, "Plots/Top174OrderedByOriginalList.txt")
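The name matching above (and in several later examples) is a nested linear scan over both lists. Not what the original code does, but as an illustration of a common alternative: assuming the names are unique, the same indexes and ordered_names can be built in a single pass with a set lookup.

def match_indexes(film_names, top250names):
    # Single-pass equivalent of the nested scan, assuming the names are unique
    wanted = set(top250names)
    indexes, ordered_names = [], []
    for i, name in enumerate(film_names):
        if name in wanted:
            indexes.append(i)
            ordered_names.append(name)
    return indexes, ordered_names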
Example #3
def main(data_type, vector_size, window_size, min_count, sampling_threshold, negative_size,
                               train_epoch, dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + " MC" + str(min_count) + " ST" + str(
        sampling_threshold) + \
                " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(
        worker_count) + "spacy"
    " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(worker_count) + \
    " TW" + str(train_wv) + " CW" + str(concatenate_wv) + " HS" + str(use_hierarchical_softmax)

    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"

    if os.path.exists(corpus_fn) is False:
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=object)  # np.object is deprecated in recent NumPy
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)

    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"

    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"

    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
                        negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax)
        model.save(model_fn)

    if os.path.exists(vector_fn) is False:
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)

    if os.path.exists(score_fn) is False or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
Example #4
def convertEntityNamesToIDS(ID_fn, all_names_fn, individual_names_fn, output_fn):
    ID_fn = dt.import1dArray(ID_fn)
    all_names_fn = dt.import1dArray(all_names_fn)
    individual_names_fn = dt.import1dArray(individual_names_fn)
    indexes = []

    for n in range(len(all_names_fn)):
        for name in individual_names_fn:
            if all_names_fn[n] == name:
                indexes.append(n)
    dt.write1dArray(np.asarray(ID_fn)[indexes], output_fn)
def main(data_type):

    if data_type == "newsgroups":
        corpus = fetch_20newsgroups(subset='all',
                                    shuffle=False,
                                    remove=("headers", "footers",
                                            "quotes")).data
        tokenized_corpus, text_corpus = tokenizeLowercaseSpacy(corpus)
        np.save("../data/raw/newsgroups/corpus.npy", tokenized_corpus)
        dt.write1dArray(text_corpus,
                        "../data/raw/newsgroups/corpus_processed.txt")
Example #6
def getAvailableEntities(entity_names_fns, data_type, classification):
    entity_names = []
    for e in entity_names_fns:
        entity_names.append(dt.import1dArray(e))
    seen = {}
    for entity_name in entity_names:
        for name in entity_name:
            seen[name] = 0
    available_entities = []
    for key in seen:
        available_entities.append(key)
    dt.write1dArray(available_entities, "../data/" + data_type + "/classify/" + classification + "/available_entities.txt")
Example #7
def writeFromMultiClass(multi_class_fn, output_folder, entity_names_fn, data_type, classify_name):
    # Get the entities we have phrases for
    entity_names = dt.import1dArray(entity_names_fn)

    # Import multi classes
    multi_class = dt.import1dArray(multi_class_fn)
    class_names = []
    class_val = []
    highest_class = 0

    for line in multi_class:
        cn, cv = re.split(r'\t+', line)
        cv = int(cv)
        class_names.append(cn)
        class_val.append(cv)
        if cv > highest_class:
            highest_class = cv



    matched_entity_names = list(set(entity_names).intersection(class_names))
    matched_entity_names.sort()
    dt.write1dArray(matched_entity_names, "../data/" + data_type + "/classify/"+classify_name+"/available_entities.txt")


    indexes_to_delete = []

    for n in range(len(class_names)):
        found = False
        for en in range(len(matched_entity_names)):
            if class_names[n] == matched_entity_names[en]:
                found=True
                break
        if found is False:
            indexes_to_delete.append(n)

    class_val = np.delete(class_val, indexes_to_delete)

    classes = []
    print("Found " + str(highest_class) + " classes")
    for e in range(len(matched_entity_names)):
        class_a = [0] * highest_class
        class_a[class_val[e]-1] = 1
        classes.append(class_a)
    dt.write2dArray(classes, "../data/"+data_type+"/classify/"+classify_name+"/class-all")
    print("Wrote class all")
    classes = np.asarray(classes).transpose()


    for cn in range(len(classes)):
        dt.write1dArray(classes[cn], "../data/"+data_type+"/classify/"+classify_name+"/class-"+str(cn))
        print("Wrote", "class-"+str(cn))
Example #8
def trimRankings(rankings_fn, available_indexes_fn, names, folder_name):
    available_indexes = dt.import1dArray(available_indexes_fn, "i")  # indexes must be ints for np.take
    rankings = np.asarray(dt.import2dArray(rankings_fn))
    names = dt.import1dArray(names)
    trimmed_rankings = []
    for r in range(len(rankings)):
        trimmed = rankings[r].take(available_indexes)
        trimmed_rankings.append(trimmed)
    for a in range(len(trimmed_rankings)):
        print("Writing", names[a])
        dt.write1dArray(trimmed_rankings[a], folder_name + "class-" + names[a])
    print("Writing", rankings_fn[-6:])
    dt.write2dArray(trimmed_rankings, folder_name + "class-" + rankings_fn[-6:])
Example #9
def getNDCG(rankings_fn,
            fn,
            data_type,
            bow_fn,
            ppmi_fn,
            lowest_count,
            rewrite_files=False,
            highest_count=0,
            classification=""):

    # Check if the NDCG scores have already been calculated, if they have then skip.
    ndcg_fn = "../data/" + data_type + "/ndcg/" + fn + ".txt"

    all_fns = [ndcg_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", getNDCG.__name__)
        return
    else:
        print("Running task", getNDCG.__name__)

    # Get the file names for the PPMI values for every word and a list of words ("names")
    names = dt.import1dArray("../data/" + data_type + "/bow/names/" + bow_fn)
    ppmi = dt.import2dArray("../data/" + data_type + "/bow/ppmi/" + ppmi_fn)
    # Process the rankings line-by-line so as to not run out of memory
    ndcg_a = []
    #spearman_a = []
    with open(rankings_fn) as rankings:
        r = 0
        for lr in rankings:
            # Pair the r-th ranking line with the r-th word's PPMI row
            lp = ppmi[r]
            # Get the ranking as indices sorted by score, highest first, e.g. "1, 4, 3, 50"
            sorted_indices = np.argsort(
                list(map(float,
                         lr.strip().split())))[::-1]
            # Get the NDCG score of the ranked indices, using the PPMI values as relevance scores
            ndcg = ndcg_from_ranking(lp, sorted_indices)

            # Add to array and print
            ndcg_a.append(ndcg)
            print("ndcg", ndcg, names[r], r)
            """
            smr = spearmanr(ppmi_indices, sorted_indices)[1]
            spearman_a.append(smr)
            print("spearman", smr, names[r], r)
            """
            r += 1
    # Save NDCG
    dt.write1dArray(ndcg_a, ndcg_fn)
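The ndcg_from_ranking helper used above is not shown on this page. As a sketch only, under the standard NDCG definition with linear gains and a log2 position discount (the project's actual helper may use exponential gains or a rank cutoff), it could look like this:

import numpy as np

def dcg_from_ranking(relevance, ranking):
    # DCG: relevance of each ranked item, discounted by log2(position + 1)
    relevance = np.asarray(relevance, dtype=float)
    ranking = np.asarray(ranking, dtype=int)
    gains = relevance[ranking]
    discounts = np.log2(np.arange(len(ranking)) + 2)
    return np.sum(gains / discounts)

def ndcg_from_ranking(relevance, ranking):
    # Normalise by the DCG of the ideal (relevance-sorted) ranking
    ideal = np.argsort(relevance)[::-1][:len(ranking)]
    best = dcg_from_ranking(relevance, ideal)
    return dcg_from_ranking(relevance, ranking) / best if best > 0 else 0.0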
Example #10
    def get_code(self, tree, feature_names, class_names, filename, data_type):
        rules_array = []
        dt.write1dArray(
            rules_array,
            "../data/" + data_type + "/rules/text_rules/" + filename + ".txt")
        # Probably not needed
        cleaned = jsbeautifier.beautify_file("../data/" + data_type +
                                             "/rules/text_rules/" + filename +
                                             ".txt")
        file = open(
            "../data/" + data_type + "/rules/text_rules/" + filename + ".txt",
            "w")
        file.write(cleaned)
        file.close()
Example #11
def getSimilarClusters(cluster_dict_1, cluster_dict_2, trim_amt, file_name, data_type, threshold_for_stopping, threshold_for_stopping_1):
    matching_clusters = np.zeros(len(cluster_dict_1), dtype=np.int32)
    new_cluster_dict_2 = []
    for c in cluster_dict_2:
        new_cluster_dict_2.append(np.flipud(c))
    cluster_dict_2 = None
    cluster_dict_2 = new_cluster_dict_2
    positions = np.zeros(len(cluster_dict_1))
    for c in range(len(cluster_dict_1)):
        print(c)
        lowest_pos = 242343
        lowest_cluster = len(cluster_dict_2)-1
        for n in range(len(cluster_dict_1[c])):
            if n > threshold_for_stopping_1:
                break
            name_to_match = cluster_dict_1[c][n]
            if ":" in name_to_match:
                name_to_match = name_to_match[:-1]
            for c2 in range(len(cluster_dict_2)):
                for n2 in range(len(cluster_dict_2[c2])):
                    if n2 > threshold_for_stopping:
                        break
                    name_to_match2 = cluster_dict_2[c2][n2]
                    if ":" in name_to_match2:
                        name_to_match2 = name_to_match2[:-1]
                    if name_to_match == name_to_match2:
                        if n2 < lowest_pos:
                            lowest_cluster = c2
                            lowest_pos = n2
                            break
            matching_clusters[c] = lowest_cluster
            positions[c] = lowest_pos
    sorted_matching_indexes = matching_clusters[np.argsort(positions)]
    sorted_orig_indexes = np.asarray(list(range(len(cluster_dict_1))))[np.argsort(positions)]
    print("_--------------------------------------------------")
    print("SORTED")
    print("_--------------------------------------------------")
    lines = []
    for c in range(len(sorted_orig_indexes)):
        line_p1 = ""
        for n in cluster_dict_1[sorted_orig_indexes[c]][:trim_amt]:
            line_p1 = line_p1 + n + " "
        line_pl2 = ""
        for k in cluster_dict_2[sorted_matching_indexes[c]][:trim_amt]:
            line_pl2 = line_pl2 + k + " "
        line =  line_p1 + " |||| " + line_pl2
        lines.append(line)
        print(line)
    dt.write1dArray(lines, "../data/" + data_type + "/investigate/" + file_name + str(trim_amt) + ".txt")
Example #12
def printIndividualFromAll(data_type, type, lowest_count, max,  classification, all_fn=None, names_array = None):
    fn = "../data/" + data_type + "/bow/"
    if all_fn is None:
        all_fn = fn + type + "/class-all-"+str(lowest_count)+"-"+str(max)+"-"+str(classification)
    if names_array is None:
        names = dt.import1dArray(fn + "names/"+str(lowest_count)+"-"+str(max)+"-"+str(classification)+".txt")
    else:
        names = names_array
    with open(all_fn) as all:
        c = 0
        for la in all:
            convert = dt.convertLine(la)
            dt.write1dArray(convert, fn + type + "/class-" + str(names[c]) + "-" + str(lowest_count) + "-" + str(max) + "-" + str(classification))
            print(c, len(names), names[c])
            c += 1
    print("wrote individual from all")
Example #13
def getTop250Movies(entity_names):
    imdb = dt.import1dArray("../data/raw/imdb/ratings/ratings.list")[28:278]
    orig_en = list(entity_names)  # make a copy; the loop below normalises entity_names in place
    for e in range(len(entity_names)):
        entity_names[e] = "".join(entity_names[e].split()[:-1])
        entity_names[e] = dt.removeEverythingFromString(entity_names[e])
    top_en = []

    for string in imdb:
        string = string.split(".")[1][1:]
        string = string.split()[:-1]
        string = " ".join(string)
        string = dt.removeEverythingFromString(string)
        top_en.append(string)
    matched_index = []
    for e in range(len(entity_names)):
        for x in range(len(top_en)):
            if entity_names[e] == top_en[x]:
                matched_index.append(e)
                print(entity_names[e])
                break
    dt.write1dArray(matched_index, "../data/movies/top_imdb_indexes.txt")
Example #14
def writeClassesFromNames(folder_name, file_names, output_folder):
    names = dt.getFolder(folder_name)
    all_names = defaultdict(int)
    entity_names = dt.import1dArray(file_names)
    translator = str.maketrans({key: None for key in string.punctuation})

    for type in range(len(names)):
        for n in range(len(names[type])):
            names[type][n] = dt.removeEverythingFromString(names[type][n])
            all_names[names[type][n]] += 1
    available_class_names = []
    available_indexes = []
    for n in range(len(entity_names)):
        name = entity_names[n]
        original_name = name
        name = dt.removeEverythingFromString(name)
        if all_names[name] > 0:
            available_class_names.append(original_name)
            available_indexes.append(n)
            print(name, "exists")
        else:
            print(name, "FAIL")
    dt.write1dArray(available_indexes, output_folder + "available_indexes.txt")
    dt.write1dArray(available_class_names, output_folder + "available_entities.txt")
    print("Wrote available indexes and entities")
    class_all = []
    for c in range(len(names)):
        binary_class = []
        for n in range(len(available_class_names)):
            available_class_names[n] = dt.removeEverythingFromString(available_class_names[n])
            if available_class_names[n] in names[c]:
                binary_class.append(1)
            else:
                binary_class.append(0)
        dt.write1dArray(binary_class, output_folder + "class-"+str(c)+"")
        class_all.append(binary_class)
    dt.write2dArray(np.asarray(class_all).transpose(), output_folder + "class-all")
    print("Wrote class-all")
Example #15
                for n in range(len(entities_unique)):
                    if clean_ent_unique[n] == clean_us_ent[i]:
                        new_class_all[a+4][n] = 1
                        break

    names = ["UK-PG",
    "UK-12-12A",
    "UK-15",
    "UK-18",
    "USA-G",
    "USA-PG-PG13",
    "USA-R"
    ]

    for i in range(len(new_class_all)):
        dt.write1dArray(new_class_all[i], "../data/movies/classify/ratings/class-" + names[i])

    new_class_all = np.asarray(new_class_all).transpose()

    dt.write2dArray(new_class_all, "../data/movies/classify/ratings/class-all")
    dt.write1dArray(entities_unique, "../data/movies/classify/ratings/available_entities.txt")
    """
    get_all = False
    additional_name = ""
    #make_individual = True
    make_individual = False
    sparse_matrix = False
    print("??")

    class_type = "movies"
    classification = "all"
Example #16
def removeClass(array_fn):
    array = dt.import1dArray(array_fn)
    for e in range(len(array)):
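        # Strip the 6-character "class-" prefix from each entry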
        array[e] = array[e][6:]
    dt.write1dArray(array, array_fn)
Example #17
    def __init__(self,
                 vector_path,
                 class_path,
                 property_names_fn,
                 file_name,
                 svm_type,
                 training_size=10000,
                 lowest_count=200,
                 highest_count=21470000,
                 get_kappa=True,
                 get_f1=True,
                 single_class=True,
                 data_type="movies",
                 getting_directions=True,
                 threads=1,
                 chunk_amt=0,
                 chunk_id=0,
                 rewrite_files=False,
                 classification="all",
                 loc="../data/"):

        self.get_kappa = True  # the get_kappa argument is not used; kappa is always requested
        self.get_f1 = get_f1
        self.data_type = data_type
        self.classification = classification
        self.lowest_amt = lowest_count
        self.higher_amt = highest_count

        if chunk_amt > 0:
            file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(
                chunk_amt)

        directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
        ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
        kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
        acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"

        all_fns = [directions_fn, kappa_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "getSVMResults")
            return
        else:
            print("Running task", "getSVMResults")

        y_train = 0
        y_test = 0
        vectors = np.asarray(dt.import2dArray(vector_path))
        print("imported vectors")
        if not getting_directions:
            classes = np.asarray(dt.import2dArray(class_path))
            print("imported classes")
        property_names = dt.import1dArray(property_names_fn)
        print("imported propery names")
        if chunk_amt > 0:
            if chunk_id == chunk_amt - 1:
                chunk = int(len(property_names) / chunk_amt)
                multiply = chunk_amt - 1
                property_names = property_names[chunk * multiply:]
            else:
                property_names = dt.chunks(
                    property_names, int(
                        (len(property_names) / chunk_amt)))[chunk_id]

        if not getting_directions:
            x_train, x_test, y_train, y_test = train_test_split(vectors,
                                                                classes,
                                                                test_size=0.3,
                                                                random_state=0)
        else:
            x_train = vectors
            x_test = vectors

        if get_f1:
            y_train = y_train.transpose()
            y_test = y_test.transpose()
            print("transpoosed")
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        if self.get_f1 is False:
            print("running svms")
            kappa_scores, directions, ktau_scores, property_names = self.runAllSVMs(
                y_test, y_train, property_names, file_name, svm_type,
                getting_directions, threads)

            dt.write1dArray(kappa_scores, kappa_fn)
            dt.write2dArray(directions, directions_fn)
            dt.write1dArray(ktau_scores, ktau_scores_fn)
            dt.write1dArray(property_names,
                            property_names_fn + file_name + ".txt")
        else:
            final_f1 = []
            final_acc = []
            for y in range(len(y_train)):
                f1, acc = self.runClassifySVM(y_test[y], y_train[y])
                print(f1, acc)
                final_f1.append(f1)
                final_acc.append(acc)
            dt.write1dArray(final_f1, ktau_scores_fn)
            dt.write1dArray(final_acc, acc_fn)
Example #18
    def __init__(self,
                 class_path=None,
                 get_scores=False,
                 randomize_finetune_weights=False,
                 dropout_noise=None,
                 amount_of_hidden=0,
                 epochs=1,
                 learn_rate=0.01,
                 loss="mse",
                 batch_size=1,
                 past_model_bias_fn=None,
                 identity_swap=False,
                 reg=0.0,
                 amount_of_finetune=[],
                 output_size=25,
                 hidden_activation="tanh",
                 layer_init="glorot_uniform",
                 output_activation="tanh",
                 deep_size=None,
                 corrupt_finetune_weights=False,
                 split_to_use=-1,
                 hidden_layer_size=100,
                 file_name="unspecified_filename",
                 vector_path=None,
                 is_identity=False,
                 finetune_size=0,
                 data_type="movies",
                 optimizer_name="rmsprop",
                 noise=0.0,
                 fine_tune_weights_fn=None,
                 past_model_weights_fn=None,
                 from_ae=True,
                 save_outputs=False,
                 label_names_fn="",
                 rewrite_files=False,
                 cv_splits=1,
                 cutoff_start=0.2,
                 development=False,
                 class_weight=None,
                 csv_fn=None,
                 tune_vals=False,
                 get_nnet_vectors_path=None,
                 classification_name="all",
                 limit_entities=False,
                 limited_label_fn="",
                 vector_names_fn="",
                 identity_activation="linear",
                 loc="../data/",
                 lock_weights_and_redo=False):

        total_file_name = loc + data_type + "/nnet/spaces/" + file_name
        weights_fn = loc + data_type + "/nnet/weights/" + file_name + "L0.txt"
        bias_fn = loc + data_type + "/nnet/bias/" + file_name + "L0.txt"
        rank_fn = loc + data_type + "/nnet/clusters/" + file_name + ".txt"

        all_fns = [weights_fn, bias_fn, rank_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "nnet")
            return
        else:

            print("Running task", "nnet")

        self.class_path = class_path
        self.learn_rate = learn_rate
        self.epochs = epochs
        self.loss = loss
        self.batch_size = batch_size
        self.hidden_activation = hidden_activation
        self.layer_init = layer_init
        self.output_activation = output_activation
        self.hidden_layer_size = hidden_layer_size
        self.file_name = file_name
        self.vector_path = vector_path
        self.dropout_noise = dropout_noise
        self.finetune_size = finetune_size
        self.get_scores = get_scores
        self.reg = reg
        self.amount_of_finetune = amount_of_finetune
        self.amount_of_hidden = amount_of_hidden
        self.output_size = output_size
        self.identity_swap = identity_swap
        self.deep_size = deep_size
        self.from_ae = from_ae
        self.is_identity = is_identity
        self.randomize_finetune_weights = randomize_finetune_weights
        self.corrupt_finetune_weights = corrupt_finetune_weights
        self.deep_size = deep_size
        self.fine_tune_weights_fn = fine_tune_weights_fn
        self.identity_activation = identity_activation
        self.lock_weights_and_redo = lock_weights_and_redo

        print(data_type)

        if optimizer_name == "adagrad":
            self.optimizer = Adagrad()
        elif optimizer_name == "sgd":
            self.optimizer = SGD()
        elif optimizer_name == "rmsprop":
            self.optimizer = RMSprop()
        elif optimizer_name == "adam":
            self.optimizer = Adam()
        elif optimizer_name == "adadelta":
            self.optimizer = Adadelta()
        else:
            print("optimizer not found")
            exit()

        entity_vectors = np.asarray(dt.import2dArray(self.vector_path))
        print("Imported vectors", len(entity_vectors), len(entity_vectors[0]))

        if get_nnet_vectors_path is not None:
            nnet_vectors = np.asarray(dt.import2dArray(get_nnet_vectors_path))
            print("Imported vectors", len(entity_vectors),
                  len(entity_vectors[0]))

        entity_classes = np.asarray(dt.import2dArray(self.class_path))
        print("Imported classes", len(entity_classes), len(entity_classes[0]))

        if fine_tune_weights_fn is None:
            vector_names = dt.import1dArray(vector_names_fn)
            limited_labels = dt.import1dArray(limited_label_fn)
            entity_vectors = np.asarray(
                dt.match_entities(entity_vectors, limited_labels,
                                  vector_names))

        if fine_tune_weights_fn is not None:
            if len(entity_vectors) != len(entity_classes):
                entity_classes = entity_classes.transpose()
                print("Transposed classes, now in form", len(entity_classes),
                      len(entity_classes[0]))
                """
                # IF Bow
                if len(entity_vectors[0]) != len(entity_classes[0]):
                    entity_vectors = entity_vectors.transpose()
                    print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0]))
                """
        elif len(entity_vectors) != len(entity_classes):
            entity_vectors = entity_vectors.transpose()
            print("Transposed vectors, now in form", len(entity_vectors),
                  len(entity_vectors[0]))

        self.input_size = len(entity_vectors[0])
        self.output_size = len(entity_classes[0])

        if fine_tune_weights_fn is not None:
            model_builder = self.fineTuneNetwork
            weights = []
            if from_ae:
                self.past_weights = []
                past_model_weights = []
                for p in past_model_weights_fn:
                    past_model_weights.append(
                        np.asarray(dt.import2dArray(p), dtype="float64"))
                past_model_bias = []
                for p in past_model_bias_fn:
                    past_model_bias.append(
                        np.asarray(dt.import1dArray(p, "f"), dtype="float64"))

                for p in range(len(past_model_weights)):
                    past_model_weights[p] = np.around(past_model_weights[p],
                                                      decimals=6)
                    past_model_bias[p] = np.around(past_model_bias[p],
                                                   decimals=6)

                for p in range(len(past_model_weights)):
                    self.past_weights.append([])
                    self.past_weights[p].append(past_model_weights[p])
                    self.past_weights[p].append(past_model_bias[p])
            for f in fine_tune_weights_fn:
                weights.extend(dt.import2dArray(f))

            r = np.asarray(weights, dtype="float64")

            for a in range(len(r)):
                r[a] = np.around(r[a], decimals=6)

            for a in range(len(entity_classes)):
                entity_classes[a] = np.around(entity_classes[a], decimals=6)

            self.fine_tune_weights = []
            self.fine_tune_weights.append(r.transpose())
            self.fine_tune_weights.append(
                np.zeros(shape=len(r), dtype="float64"))
        else:
            model_builder = self.classifierNetwork

        models = []
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        x_dev = []
        y_dev = []
        train_x_c = []
        train_y_c = []

        c = 0
        for i in range(cv_splits):
            if split_to_use > -1:
                if c != split_to_use:
                    c += 1
                    continue

            models.append(model_builder())
            c += 1

        # Converting labels to categorical

        f1_scores = []
        accuracy_scores = []
        f1_averages = []
        accuracy_averages = []
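        # With cv_splits == 1, a single 3-fold split is created and only its first fold is used (the loop breaks below)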
        if cv_splits == 1:
            k_fold = KFold(n_splits=3, shuffle=False, random_state=None)
        else:
            k_fold = KFold(n_splits=cv_splits,
                           shuffle=False,
                           random_state=None)
        c = 0
        for train, test in k_fold.split(entity_vectors):
            if split_to_use > -1:
                if c != split_to_use:
                    c += 1
                    continue
            x_train.append(entity_vectors[train[:int(len(train) * 0.8)]])
            y_train.append(entity_classes[train[:int(len(train) * 0.8)]])
            x_test.append(entity_vectors[test])
            y_test.append(entity_classes[test])
            x_dev.append(entity_vectors[train[int(len(train) *
                                                  0.8):len(train)]])
            y_dev.append(entity_classes[train[int(len(train) *
                                                  0.8):len(train)]])

            train_x_c, train_y_c = entity_vectors[
                train[:int(len(train) *
                           0.8)]], entity_classes[train[:int(len(train) *
                                                             0.8)]]

            if fine_tune_weights_fn is not None:
                train_x_c = entity_vectors
                train_y_c = entity_classes
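            # nb_epoch is the Keras 1 argument name; Keras 2 renamed it to epochs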
            hist = models[0].fit(train_x_c,
                                 train_y_c,
                                 nb_epoch=self.epochs,
                                 batch_size=self.batch_size,
                                 verbose=1,
                                 class_weight=class_weight)
            print(hist.history)
            c += 1
            if cv_splits == 1 or split_to_use == c:
                break
        if lock_weights_and_redo:
            print("REDO WITH LOCKED WEIGHTS")

            unlocked_model = Sequential()
            for l in range(0, len(models[0].layers) - 1):
                unlocked_model.add(models[0].layers[l])

            self.end_space = unlocked_model.predict(entity_vectors)
            total_file_name = loc + data_type + "/nnet/spaces/" + file_name
            dt.write2dArray(self.end_space,
                            total_file_name + "L" + str(l) + "LSPACE" + ".txt")
            unlocked_model.add(
                Dense(output_dim=finetune_size,
                      input_dim=self.hidden_layer_size,
                      activation="linear",
                      weights=self.fine_tune_weights))  #
            unlocked_model.compile(loss=self.loss, optimizer=self.optimizer)

            models[0] = unlocked_model
            hist = models[0].fit(train_x_c,
                                 train_y_c,
                                 nb_epoch=self.epochs,
                                 batch_size=self.batch_size,
                                 verbose=1,
                                 class_weight=class_weight)

        original_fn = file_name
        for m in range(len(models)):
            if development:
                x_test[m] = x_dev[m]
                y_test[m] = y_dev[m]

            if get_scores:

                vals_to_try = np.arange(start=cutoff_start, stop=1, step=0.01)
                test_pred = models[m].predict(x_train[m]).transpose()
                print(test_pred)
                y_train_m = np.asarray(y_train[m]).transpose()
                highest_f1 = [0] * len(test_pred)
                highest_vals = [0.2] * len(test_pred)

                if tune_vals:
                    for c in range(len(test_pred)):
                        for val in vals_to_try:
                            test_pred_c = np.copy(test_pred[c])
                            test_pred_c[test_pred_c >= val] = 1
                            test_pred_c[test_pred_c < val] = 0
                            acc = accuracy_score(y_train_m[c], test_pred_c)
                            f1 = f1_score(y_train_m[c],
                                          test_pred_c,
                                          average="binary")
                            f1 = (f1 + acc) / 2
                            if f1 > highest_f1[c]:
                                highest_f1[c] = f1
                                highest_vals[c] = val
                print("optimal f1s", highest_f1)
                print("optimal vals", highest_vals)
                y_pred = models[m].predict(x_test[m]).transpose()
                y_test[m] = np.asarray(y_test[m]).transpose()
                for y in range(len(y_pred)):
                    y_pred[y][y_pred[y] >= highest_vals[y]] = 1
                    y_pred[y][y_pred[y] < highest_vals[y]] = 0
                f1_array = []
                accuracy_array = []
                for y in range(len(y_pred)):
                    accuracy_array.append(
                        accuracy_score(y_test[m][y], y_pred[y]))
                    f1_array.append(
                        f1_score(y_test[m][y], y_pred[y], average="binary"))
                    print(f1_array[y])
                y_pred = y_pred.transpose()
                y_test[m] = np.asarray(y_test[m]).transpose()
                micro_average = f1_score(y_test[m], y_pred, average="micro")
                cv_f1_fn = loc + data_type + "/nnet/scores/F1 " + file_name + ".txt"
                cv_acc_fn = loc + data_type + "/nnet/scores/ACC " + file_name + ".txt"
                dt.write1dArray(f1_array, cv_f1_fn)
                dt.write1dArray(accuracy_array, cv_acc_fn)
                f1_scores.append(f1_array)
                accuracy_scores.append(accuracy_array)
                f1_average = np.average(f1_array)
                accuracy_average = np.average(accuracy_array)
                f1_averages.append(f1_average)
                accuracy_averages.append(accuracy_average)
                print("Average F1 Binary", f1_average, "Acc", accuracy_average)
                print("Micro Average F1", micro_average)

                f1_array.append(f1_average)
                f1_array.append(micro_average)
                accuracy_array.append(accuracy_average)
                accuracy_array.append(0.0)

                scores = [accuracy_array, f1_array]

                csv_fn = loc + data_type + "/nnet/csv/" + csv_fn + ".csv"

                file_names = [file_name + "ACC", file_name + "F1"]
                label_names = dt.import1dArray(label_names_fn)
                if dt.fileExists(csv_fn):
                    print("File exists, writing to csv")
                    try:
                        dt.write_to_csv(csv_fn, file_names, scores)
                    except PermissionError:
                        print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                        dt.write_to_csv(
                            csv_fn[:len(csv_fn) - 4] + str(random.random()) +
                            "FAIL.csv", file_names, scores)
                else:
                    print("File does not exist, recreating csv")
                    key = []
                    for l in label_names:
                        key.append(l)
                    key.append("AVERAGE")
                    key.append("MICRO AVERAGE")
                    dt.write_csv(csv_fn, file_names, scores, key)

            if save_outputs:
                if limit_entities is False:
                    self.output_clusters = models[m].predict(nnet_vectors)
                else:
                    self.output_clusters = models[m].predict(entity_vectors)
                self.output_clusters = self.output_clusters.transpose()
                dt.write2dArray(self.output_clusters, rank_fn)

            for l in range(0, len(models[m].layers) - 1):
                if dropout_noise is not None and dropout_noise > 0.0:
                    if l % 2 == 1:
                        continue
                print("Writing", l, "layer")
                truncated_model = Sequential()
                for a in range(l + 1):
                    truncated_model.add(models[m].layers[a])
                truncated_model.compile(loss=self.loss, optimizer="sgd")
                if get_nnet_vectors_path is not None:
                    self.end_space = truncated_model.predict(nnet_vectors)
                else:
                    self.end_space = truncated_model.predict(entity_vectors)
                total_file_name = loc + data_type + "/nnet/spaces/" + file_name
                dt.write2dArray(self.end_space,
                                total_file_name + "L" + str(l) + ".txt")

            for l in range(len(models[m].layers)):
                try:
                    dt.write2dArray(
                        models[m].layers[l].get_weights()[0], loc + data_type +
                        "/nnet/weights/" + file_name + "L" + str(l) + ".txt")
                    dt.write1dArray(
                        models[m].layers[l].get_weights()[1], loc + data_type +
                        "/nnet/bias/" + file_name + "L" + str(l) + ".txt")
                except IndexError:
                    print("Layer ", str(l), "Failed")

        if cv_splits > 1:
            class_f1_averages = []
            class_accuracy_averages = []
            f1_scores = np.asarray(f1_scores).transpose()
            accuracy_scores = np.asarray(accuracy_scores).transpose()

            for c in range(len(f1_scores)):
                class_f1_averages.append(np.average(f1_scores[c]))
                class_accuracy_averages.append(np.average(accuracy_scores[c]))

            f1_fn = loc + data_type + "/nnet/scores/F1 " + file_name + ".txt"
            acc_fn = loc + data_type + "/nnet/scores/ACC " + file_name + ".txt"
            dt.write1dArray(class_f1_averages, f1_fn)
            dt.write1dArray(class_accuracy_averages, acc_fn)
            overall_f1_average = np.average(f1_averages)
            overall_accuracy_average = np.average(accuracy_averages)
Example #19
    def __init__(self,
                 features_fn,
                 classes_fn,
                 class_names_fn,
                 cluster_names_fn,
                 filename,
                 max_depth=None,
                 balance=None,
                 criterion="entropy",
                 save_details=False,
                 data_type="movies",
                 cv_splits=5,
                 csv_fn="../data/temp/no_csv_provided.csv",
                 rewrite_files=True,
                 split_to_use=-1,
                 development=False,
                 limit_entities=False,
                 limited_label_fn=None,
                 vector_names_fn=None,
                 pruning=1,
                 save_results_so_far=False):

        vectors = np.asarray(dt.import2dArray(features_fn)).transpose()

        labels = np.asarray(dt.import2dArray(classes_fn, "i"))

        print("vectors", len(vectors), len(vectors[0]))
        print("labels", len(labels), len(labels[0]))
        print("vectors", len(vectors), len(vectors[0]))
        cluster_names = dt.import1dArray(cluster_names_fn)
        label_names = dt.import1dArray(class_names_fn)
        all_fns = []
        file_names = ['ACC J48' + filename, 'F1 J48' + filename]
        acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            0] + '.scores'
        f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            1] + '.scores'
        all_fns.append(acc_fn)
        all_fns.append(f1_fn)
        all_fns.append(csv_fn)

        print(dt.allFnsAlreadyExist(all_fns), rewrite_files)

        if dt.allFnsAlreadyExist(
                all_fns) and not rewrite_files or save_results_so_far:
            print("Skipping task", "Weka Tree")
            return
        else:
            print("Running task", "Weka Tree")

        for l in range(len(cluster_names)):
            cluster_names[l] = cluster_names[l].split()[0]
        """
        for l in range(len(label_names)):
            if label_names[l][:6] == "class-":
                label_names[l] = label_names[l][6:]
        """
        f1_array = []
        accuracy_array = []

        labels = labels.transpose()
        print("labels transposed")
        print("labels", len(labels), len(labels[0]))

        if limit_entities is False:
            vector_names = dt.import1dArray(vector_names_fn)
            limited_labels = dt.import1dArray(limited_label_fn)
            vectors = np.asarray(
                dt.match_entities(vectors, limited_labels, vector_names))

        all_y_test = []
        all_predictions = []
        for l in range(len(labels)):

            if balance:
                new_vectors, new_labels = dt.balanceClasses(vectors, labels[l])
            else:
                new_vectors = vectors
                new_labels = labels[l]
            # Select training data with cross validation

            ac_y_test = []
            ac_y_train = []
            ac_x_train = []
            ac_x_test = []
            ac_y_dev = []
            ac_x_dev = []
            cv_f1 = []
            cv_acc = []
            if cv_splits == 1:
                kf = KFold(n_splits=3, shuffle=False, random_state=None)
            else:
                kf = KFold(n_splits=cv_splits,
                           shuffle=False,
                           random_state=None)
            c = 0
            for train, test in kf.split(new_vectors):
                if split_to_use > -1:
                    if c != split_to_use:
                        c += 1
                        continue
                ac_y_test.append(new_labels[test])
                ac_y_train.append(new_labels[train[int(len(train) * 0.2):]])
                val = int(len(train) * 0.2)
                t_val = train[val:]
                nv_t_val = new_vectors[t_val]
                ac_x_train.append(nv_t_val)
                ac_x_test.append(new_vectors[test])
                ac_x_dev.append(new_vectors[train[:int(len(train) * 0.2)]])
                ac_y_dev.append(new_labels[train[:int(len(train) * 0.2)]])
                c += 1
                if cv_splits == 1:
                    break

            predictions = []
            rules = []

            if development:
                ac_x_test = np.copy(np.asarray(ac_x_dev))
                ac_y_test = np.copy(np.asarray(ac_y_dev))

            train_fn = "../data/" + data_type + "/weka/data/" + filename + "Train.txt"
            test_fn = "../data/" + data_type + "/weka/data/" + filename + "Test.txt"

            for splits in range(len(ac_y_test)):

                # Get the weka predictions
                dt.writeArff(ac_x_train[splits], [ac_y_train[splits]],
                             [label_names[splits]],
                             train_fn,
                             header=True)
                dt.writeArff(ac_x_test[splits], [ac_y_test[splits]],
                             [label_names[splits]],
                             test_fn,
                             header=True)
                prediction, rule = self.getWekaPredictions(
                    train_fn + label_names[splits] + ".arff",
                    test_fn + label_names[splits] + ".arff", save_details,
                    pruning)
                predictions.append(prediction)
                rules.append(rule)

            for i in range(len(predictions)):
                if len(predictions) == 1:
                    all_y_test.append(ac_y_test[i])
                    all_predictions.append(predictions[i])
                f1 = f1_score(ac_y_test[i], predictions[i], average="binary")
                accuracy = accuracy_score(ac_y_test[i], predictions[i])
                cv_f1.append(f1)
                cv_acc.append(accuracy)
                scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
                print(scores)

                # Export a tree for each label predicted by the clf, not sure if this is needed...
                if save_details:
                    data_fn = "../data/" + data_type + "/rules/weka_rules/" + label_names[
                        l] + " " + filename + ".txt"
                    class_names = [label_names[l], "NOT " + label_names[l]]
                    #self.get_code(clf, cluster_names, class_names, label_names[l] + " " + filename, data_type)
                    dt.write1dArray(rules[i].split("\n"), data_fn)
                    dot_file = dt.import1dArray(data_fn)
                    new_dot_file = []
                    for line in dot_file:
                        if "->" not in line and "label" in line and '"t ' not in line and '"f ' not in line:
                            line = line.split('"')
                            line[1] = '"' + cluster_names[int(line[1])] + '"'
                            line = "".join(line)
                        new_dot_file.append(line)
                    dt.write1dArray(new_dot_file, data_fn)
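                    # Note: newer pydot versions return a list here, in which case graph[0].write_png(...) is needed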
                    graph = pydot.graph_from_dot_file(data_fn)
                    graph.write_png("../data/" + data_type +
                                    "/rules/weka_images/" + label_names[l] +
                                    " " + filename + ".png")
            f1_array.append(np.average(np.asarray(cv_f1)))
            accuracy_array.append(np.average(np.asarray(cv_acc)))

        accuracy_array = np.asarray(accuracy_array)
        accuracy_average = np.average(accuracy_array)
        accuracy_array = accuracy_array.tolist()
        f1_array = np.asarray(f1_array)
        f1_average = np.average(f1_array)
        f1_array = f1_array.tolist()
        micro_average = f1_score(np.asarray(all_y_test),
                                 np.asarray(all_predictions),
                                 average="micro")

        print("Micro F1", micro_average)

        accuracy_array.append(accuracy_average)
        accuracy_array.append(0.0)

        f1_array.append(f1_average)
        f1_array.append(micro_average)

        scores = [accuracy_array, f1_array]

        dt.write1dArray(accuracy_array, acc_fn)
        dt.write1dArray(f1_array, f1_fn)

        print(csv_fn)
        if dt.fileExists(csv_fn):
            print("File exists, writing to csv")
            try:
                dt.write_to_csv(csv_fn, file_names, scores)
            except PermissionError:
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                dt.write_to_csv(
                    csv_fn[:len(csv_fn) - 4] + str(random.random()) +
                    "FAIL.csv", file_names, scores)
        else:
            print("File does not exist, recreating csv")
            key = []
            for l in label_names:
                key.append(l)
            key.append("AVERAGE")
            key.append("MICRO AVERAGE")
            dt.write_csv(csv_fn, file_names, scores, key)
Example #20
def main(corpus_fn, output_folder):
    corpus = fetch_20newsgroups(subset='all', shuffle=False, remove=("headers", "footers", "quotes")).data
    tokenized_corpus, text_corpus = spacyTokenizeLowercase(corpus)
    np.save("../data/raw/newsgroups/corpus.npy", tokenized_corpus)
    dt.write1dArray(text_corpus, "../data/raw/newsgroups/corpus_processed.txt")
Example #21
    fid, _, cats = line.partition(' ')
    doc_index = fileid_mapping[fid]
    for c in cats.split():
        class_index = cat_names[c]
        class_all[doc_index][class_index] = 1
        new_class_all[doc_index][class_index] = 1
        print(fid, doc_index, c, class_index)

print(class_all.shape)

save_path = "../data/raw/reuters/"

np.save(save_path + "fileid_mapping.npy", fileid_mapping)
np.save(save_path + "category_name_mapping.npy", cat_names)
print("cats", len(np.unique(list(cat_names.keys()))))
dt.write1dArray(list(cat_names.keys()), save_path + "category_names.txt")

names = list(fileid_mapping.keys())
for i in range(len(names)):
    names[i] = "_".join(names[i].split("/"))

dt.write1dArray(names, save_path + "available_entities.txt")
print("names", len(np.unique(names)))

dt.write2dArray(class_all, save_path + "class-all.txt")
dt.write1dArray(docs, save_path + "corpus.txt")

print("docs", len(np.unique(docs)))

unique_docs, index = np.unique(docs, return_index=True)
Example #22
def removeClass(folder_name):
    names = dt.getFns(folder_name)
    for name in names:
        if name[:12] == "class-class-":
            contents = dt.import1dArray(folder_name + name)
            dt.write1dArray(contents, folder_name + name[6:])
Example #23
def everythingElse():
    np.random.seed(1337)
    # Get frequencies, PPMI's, classes. Everything needed for directions
    skip_top = 0
    lowest_amt = skip_top
    highest_amt = 0
    index_from = 2
    classification = "all"
    bigrams = True

    if bigrams is False:
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=highest_amt, skip_top=skip_top, index_from=index_from)
    else:
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=0, skip_top=0, index_from=index_from)

    train_len = len(x_train)
    test_len = len(x_test)

    vectors = np.concatenate((x_train, x_test), axis=0)
    classes = np.concatenate((y_train, y_test), axis=0)
        #vectors = x_train[:int(len(x_train) * 0.8)]
        #classes = y_train[:int(len(y_train) * 0.8)]


    word_to_id = imdb.get_word_index()
    word_to_id = {k:(v+index_from) for k,v in word_to_id.items()}
    word_to_id["<UNK>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<OOV>"] = 2
    id_to_word = {value:key for key,value in word_to_id.items()}

    word_vectors = np.empty(shape=(len(vectors)), dtype=object)  # Have to recreate original word vectors (np.object is deprecated in recent NumPy)
    for s in range(len(vectors)):
        word_sentence = []
        for w in range(len(vectors[s])):
            word_sentence.append(id_to_word[vectors[s][w]])
        word_vectors[s] = word_sentence

    import gensim.models.phrases

    phrases = gensim.models.Phrases(word_vectors)
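    # Phraser is a slimmed-down, read-only version of the trained Phrases model, used for fast bigram lookup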
    bigram = gensim.models.phrases.Phraser(phrases)
    phrase_vectors = [bigram[sentence] for sentence in word_vectors]

    from gensim import corpora
    dictionary = corpora.Dictionary(phrase_vectors)
    dictionary.filter_extremes(no_below=highest_amt)

    dfs_list = []
    words = []
    for i in range(len(dictionary.keys())):
        words.append(dictionary[i])
        dfs_list.append(dictionary.dfs[i])
    dt.write1dArray(words, "../data/sentiment/bow/names/" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification + ".txt")
    dt.write1dArray(dfs_list, "../data/sentiment/bow/frequency/global/" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification + ".txt")
    corpus = [dictionary.doc2bow(text) for text in phrase_vectors]

    all_fn = "../data/sentiment/bow/frequency/phrases/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification
    all_fn_binary = "../data/sentiment/bow/binary/phrases/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification

    import gensim.matutils

    # corpus2csc gives a (num_terms x num_docs) CSC sparse matrix
    corpus = gensim.matutils.corpus2csc(corpus)

    sp.save_npz(all_fn, corpus)

    """
    #all_fn = "../data/sentiment/bow/frequency/phrases/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification
    #corpus = sp.load_npz(all_fn + ".npz")
    print("saving")
    
    ppmi = mt.convertPPMI( corpus)
    
    ppmi_sparse = sp.csr_matrix(ppmi)
    
    ppmi_fn = "../data/sentiment/bow/ppmi/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification
    sp.save_npz(ppmi_fn, ppmi_sparse)
    """
    """
    for key,val in id_to_word.items():
        if val == "that":
            print(key)
    
    ids = np.asarray(list(id_to_word.keys()), dtype=np.int32)
    words = np.asarray(list(id_to_word.values()), dtype="str")
    
    sorted_ids = np.argsort(ids)
    complete_word_list = words[sorted_ids]
    
    word_list = complete_word_list[skip_top+3:highest_amt]
    
    new_word_list = []
    
    new_word_list.append("<UNK>")
    new_word_list.append("<START>")
    new_word_list.append("<OOV>")
    
    
    for i in range(skip_top):
        new_word_list.append("<OOV>")
    
    
    for w in word_list:
        new_word_list.append(w)
    """
    """
    for x in x_train:
        run = False
        for n in range(len(x)):
            if x[n] > highest_amt-1000:
                run = True
        if run:
            try:
                for id in x_train[0]:
                    print(id, end=' ')
                print("")
                for id in x_train[0]:
                    print(complete_word_list[id], end=' ')
                print("")
                for id in x_train[0]:
                    print(new_word_list[id], end=' ')
                print("")
            except KeyError:
                print("fail")
        break
    """
    """
    import codecs
    decoding_table = (
        '\x00'     #  0x00 -> NULL
        '\x01'     #  0x01 -> START OF HEADING
        '\x02'     #  0x02 -> START OF TEXT
        '\x03'     #  0x03 -> END OF TEXT
        '\x04'     #  0x04 -> END OF TRANSMISSION
        '\x05'     #  0x05 -> ENQUIRY
        '\x06'     #  0x06 -> ACKNOWLEDGE
        '\x07'     #  0x07 -> BELL
        '\x08'     #  0x08 -> BACKSPACE
        '\t'       #  0x09 -> HORIZONTAL TABULATION
        '\n'       #  0x0A -> LINE FEED
        '\x0b'     #  0x0B -> VERTICAL TABULATION
        '\x0c'     #  0x0C -> FORM FEED
        '\r'       #  0x0D -> CARRIAGE RETURN
        '\x0e'     #  0x0E -> SHIFT OUT
        '\x0f'     #  0x0F -> SHIFT IN
        '\x10'     #  0x10 -> DATA LINK ESCAPE
        '\x11'     #  0x11 -> DEVICE CONTROL ONE
        '\x12'     #  0x12 -> DEVICE CONTROL TWO
        '\x13'     #  0x13 -> DEVICE CONTROL THREE
        '\x14'     #  0x14 -> DEVICE CONTROL FOUR
        '\x15'     #  0x15 -> NEGATIVE ACKNOWLEDGE
        '\x16'     #  0x16 -> SYNCHRONOUS IDLE
        '\x17'     #  0x17 -> END OF TRANSMISSION BLOCK
        '\x18'     #  0x18 -> CANCEL
        '\x19'     #  0x19 -> END OF MEDIUM
        '\x1a'     #  0x1A -> SUBSTITUTE
        '\x1b'     #  0x1B -> ESCAPE
        '\x1c'     #  0x1C -> FILE SEPARATOR
        '\x1d'     #  0x1D -> GROUP SEPARATOR
        '\x1e'     #  0x1E -> RECORD SEPARATOR
        '\x1f'     #  0x1F -> UNIT SEPARATOR
        ' '        #  0x20 -> SPACE
        '!'        #  0x21 -> EXCLAMATION MARK
        '"'        #  0x22 -> QUOTATION MARK
        '#'        #  0x23 -> NUMBER SIGN
        '$'        #  0x24 -> DOLLAR SIGN
        '%'        #  0x25 -> PERCENT SIGN
        '&'        #  0x26 -> AMPERSAND
        "'"        #  0x27 -> APOSTROPHE
        '('        #  0x28 -> LEFT PARENTHESIS
        ')'        #  0x29 -> RIGHT PARENTHESIS
        '*'        #  0x2A -> ASTERISK
        '+'        #  0x2B -> PLUS SIGN
        ','        #  0x2C -> COMMA
        '-'        #  0x2D -> HYPHEN-MINUS
        '.'        #  0x2E -> FULL STOP
        '/'        #  0x2F -> SOLIDUS
        '0'        #  0x30 -> DIGIT ZERO
        '1'        #  0x31 -> DIGIT ONE
        '2'        #  0x32 -> DIGIT TWO
        '3'        #  0x33 -> DIGIT THREE
        '4'        #  0x34 -> DIGIT FOUR
        '5'        #  0x35 -> DIGIT FIVE
        '6'        #  0x36 -> DIGIT SIX
        '7'        #  0x37 -> DIGIT SEVEN
        '8'        #  0x38 -> DIGIT EIGHT
        '9'        #  0x39 -> DIGIT NINE
        ':'        #  0x3A -> COLON
        ';'        #  0x3B -> SEMICOLON
        '<'        #  0x3C -> LESS-THAN SIGN
        '='        #  0x3D -> EQUALS SIGN
        '>'        #  0x3E -> GREATER-THAN SIGN
        '?'        #  0x3F -> QUESTION MARK
        '@'        #  0x40 -> COMMERCIAL AT
        'A'        #  0x41 -> LATIN CAPITAL LETTER A
        'B'        #  0x42 -> LATIN CAPITAL LETTER B
        'C'        #  0x43 -> LATIN CAPITAL LETTER C
        'D'        #  0x44 -> LATIN CAPITAL LETTER D
        'E'        #  0x45 -> LATIN CAPITAL LETTER E
        'F'        #  0x46 -> LATIN CAPITAL LETTER F
        'G'        #  0x47 -> LATIN CAPITAL LETTER G
        'H'        #  0x48 -> LATIN CAPITAL LETTER H
        'I'        #  0x49 -> LATIN CAPITAL LETTER I
        'J'        #  0x4A -> LATIN CAPITAL LETTER J
        'K'        #  0x4B -> LATIN CAPITAL LETTER K
        'L'        #  0x4C -> LATIN CAPITAL LETTER L
        'M'        #  0x4D -> LATIN CAPITAL LETTER M
        'N'        #  0x4E -> LATIN CAPITAL LETTER N
        'O'        #  0x4F -> LATIN CAPITAL LETTER O
        'P'        #  0x50 -> LATIN CAPITAL LETTER P
        'Q'        #  0x51 -> LATIN CAPITAL LETTER Q
        'R'        #  0x52 -> LATIN CAPITAL LETTER R
        'S'        #  0x53 -> LATIN CAPITAL LETTER S
        'T'        #  0x54 -> LATIN CAPITAL LETTER T
        'U'        #  0x55 -> LATIN CAPITAL LETTER U
        'V'        #  0x56 -> LATIN CAPITAL LETTER V
        'W'        #  0x57 -> LATIN CAPITAL LETTER W
        'X'        #  0x58 -> LATIN CAPITAL LETTER X
        'Y'        #  0x59 -> LATIN CAPITAL LETTER Y
        'Z'        #  0x5A -> LATIN CAPITAL LETTER Z
        '['        #  0x5B -> LEFT SQUARE BRACKET
        '\\'       #  0x5C -> REVERSE SOLIDUS
        ']'        #  0x5D -> RIGHT SQUARE BRACKET
        '^'        #  0x5E -> CIRCUMFLEX ACCENT
        '_'        #  0x5F -> LOW LINE
        '`'        #  0x60 -> GRAVE ACCENT
        'a'        #  0x61 -> LATIN SMALL LETTER A
        'b'        #  0x62 -> LATIN SMALL LETTER B
        'c'        #  0x63 -> LATIN SMALL LETTER C
        'd'        #  0x64 -> LATIN SMALL LETTER D
        'e'        #  0x65 -> LATIN SMALL LETTER E
        'f'        #  0x66 -> LATIN SMALL LETTER F
        'g'        #  0x67 -> LATIN SMALL LETTER G
        'h'        #  0x68 -> LATIN SMALL LETTER H
        'i'        #  0x69 -> LATIN SMALL LETTER I
        'j'        #  0x6A -> LATIN SMALL LETTER J
        'k'        #  0x6B -> LATIN SMALL LETTER K
        'l'        #  0x6C -> LATIN SMALL LETTER L
        'm'        #  0x6D -> LATIN SMALL LETTER M
        'n'        #  0x6E -> LATIN SMALL LETTER N
        'o'        #  0x6F -> LATIN SMALL LETTER O
        'p'        #  0x70 -> LATIN SMALL LETTER P
        'q'        #  0x71 -> LATIN SMALL LETTER Q
        'r'        #  0x72 -> LATIN SMALL LETTER R
        's'        #  0x73 -> LATIN SMALL LETTER S
        't'        #  0x74 -> LATIN SMALL LETTER T
        'u'        #  0x75 -> LATIN SMALL LETTER U
        'v'        #  0x76 -> LATIN SMALL LETTER V
        'w'        #  0x77 -> LATIN SMALL LETTER W
        'x'        #  0x78 -> LATIN SMALL LETTER X
        'y'        #  0x79 -> LATIN SMALL LETTER Y
        'z'        #  0x7A -> LATIN SMALL LETTER Z
        '{'        #  0x7B -> LEFT CURLY BRACKET
        '|'        #  0x7C -> VERTICAL LINE
        '}'        #  0x7D -> RIGHT CURLY BRACKET
        '~'        #  0x7E -> TILDE
        '\x7f'     #  0x7F -> DELETE
        '\u20ac'   #  0x80 -> EURO SIGN
        '\ufffe'   #  0x81 -> UNDEFINED
        '\u201a'   #  0x82 -> SINGLE LOW-9 QUOTATION MARK
        '\u0192'   #  0x83 -> LATIN SMALL LETTER F WITH HOOK
        '\u201e'   #  0x84 -> DOUBLE LOW-9 QUOTATION MARK
        '\u2026'   #  0x85 -> HORIZONTAL ELLIPSIS
        '\u2020'   #  0x86 -> DAGGER
        '\u2021'   #  0x87 -> DOUBLE DAGGER
        '\u02c6'   #  0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT
        '\u2030'   #  0x89 -> PER MILLE SIGN
        '\u0160'   #  0x8A -> LATIN CAPITAL LETTER S WITH CARON
        '\u2039'   #  0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        '\u0152'   #  0x8C -> LATIN CAPITAL LIGATURE OE
        '\ufffe'   #  0x8D -> UNDEFINED
        '\u017d'   #  0x8E -> LATIN CAPITAL LETTER Z WITH CARON
        '\ufffe'   #  0x8F -> UNDEFINED
        '\ufffe'   #  0x90 -> UNDEFINED
        '\u2018'   #  0x91 -> LEFT SINGLE QUOTATION MARK
        '\u2019'   #  0x92 -> RIGHT SINGLE QUOTATION MARK
        '\u201c'   #  0x93 -> LEFT DOUBLE QUOTATION MARK
        '\u201d'   #  0x94 -> RIGHT DOUBLE QUOTATION MARK
        '\u2022'   #  0x95 -> BULLET
        '\u2013'   #  0x96 -> EN DASH
        '\u2014'   #  0x97 -> EM DASH
        '\u02dc'   #  0x98 -> SMALL TILDE
        '\u2122'   #  0x99 -> TRADE MARK SIGN
        '\u0161'   #  0x9A -> LATIN SMALL LETTER S WITH CARON
        '\u203a'   #  0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        '\u0153'   #  0x9C -> LATIN SMALL LIGATURE OE
        '\ufffe'   #  0x9D -> UNDEFINED
        '\u017e'   #  0x9E -> LATIN SMALL LETTER Z WITH CARON
        '\u0178'   #  0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS
        '\xa0'     #  0xA0 -> NO-BREAK SPACE
        '\xa1'     #  0xA1 -> INVERTED EXCLAMATION MARK
        '\xa2'     #  0xA2 -> CENT SIGN
        '\xa3'     #  0xA3 -> POUND SIGN
        '\xa4'     #  0xA4 -> CURRENCY SIGN
        '\xa5'     #  0xA5 -> YEN SIGN
        '\xa6'     #  0xA6 -> BROKEN BAR
        '\xa7'     #  0xA7 -> SECTION SIGN
        '\xa8'     #  0xA8 -> DIAERESIS
        '\xa9'     #  0xA9 -> COPYRIGHT SIGN
        '\xaa'     #  0xAA -> FEMININE ORDINAL INDICATOR
        '\xab'     #  0xAB -> LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        '\xac'     #  0xAC -> NOT SIGN
        '\xad'     #  0xAD -> SOFT HYPHEN
        '\xae'     #  0xAE -> REGISTERED SIGN
        '\xaf'     #  0xAF -> MACRON
        '\xb0'     #  0xB0 -> DEGREE SIGN
        '\xb1'     #  0xB1 -> PLUS-MINUS SIGN
        '\xb2'     #  0xB2 -> SUPERSCRIPT TWO
        '\xb3'     #  0xB3 -> SUPERSCRIPT THREE
        '\xb4'     #  0xB4 -> ACUTE ACCENT
        '\xb5'     #  0xB5 -> MICRO SIGN
        '\xb6'     #  0xB6 -> PILCROW SIGN
        '\xb7'     #  0xB7 -> MIDDLE DOT
        '\xb8'     #  0xB8 -> CEDILLA
        '\xb9'     #  0xB9 -> SUPERSCRIPT ONE
        '\xba'     #  0xBA -> MASCULINE ORDINAL INDICATOR
        '\xbb'     #  0xBB -> RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        '\xbc'     #  0xBC -> VULGAR FRACTION ONE QUARTER
        '\xbd'     #  0xBD -> VULGAR FRACTION ONE HALF
        '\xbe'     #  0xBE -> VULGAR FRACTION THREE QUARTERS
        '\xbf'     #  0xBF -> INVERTED QUESTION MARK
        '\xc0'     #  0xC0 -> LATIN CAPITAL LETTER A WITH GRAVE
        '\xc1'     #  0xC1 -> LATIN CAPITAL LETTER A WITH ACUTE
        '\xc2'     #  0xC2 -> LATIN CAPITAL LETTER A WITH CIRCUMFLEX
        '\xc3'     #  0xC3 -> LATIN CAPITAL LETTER A WITH TILDE
        '\xc4'     #  0xC4 -> LATIN CAPITAL LETTER A WITH DIAERESIS
        '\xc5'     #  0xC5 -> LATIN CAPITAL LETTER A WITH RING ABOVE
        '\xc6'     #  0xC6 -> LATIN CAPITAL LETTER AE
        '\xc7'     #  0xC7 -> LATIN CAPITAL LETTER C WITH CEDILLA
        '\xc8'     #  0xC8 -> LATIN CAPITAL LETTER E WITH GRAVE
        '\xc9'     #  0xC9 -> LATIN CAPITAL LETTER E WITH ACUTE
        '\xca'     #  0xCA -> LATIN CAPITAL LETTER E WITH CIRCUMFLEX
        '\xcb'     #  0xCB -> LATIN CAPITAL LETTER E WITH DIAERESIS
        '\xcc'     #  0xCC -> LATIN CAPITAL LETTER I WITH GRAVE
        '\xcd'     #  0xCD -> LATIN CAPITAL LETTER I WITH ACUTE
        '\xce'     #  0xCE -> LATIN CAPITAL LETTER I WITH CIRCUMFLEX
        '\xcf'     #  0xCF -> LATIN CAPITAL LETTER I WITH DIAERESIS
        '\xd0'     #  0xD0 -> LATIN CAPITAL LETTER ETH
        '\xd1'     #  0xD1 -> LATIN CAPITAL LETTER N WITH TILDE
        '\xd2'     #  0xD2 -> LATIN CAPITAL LETTER O WITH GRAVE
        '\xd3'     #  0xD3 -> LATIN CAPITAL LETTER O WITH ACUTE
        '\xd4'     #  0xD4 -> LATIN CAPITAL LETTER O WITH CIRCUMFLEX
        '\xd5'     #  0xD5 -> LATIN CAPITAL LETTER O WITH TILDE
        '\xd6'     #  0xD6 -> LATIN CAPITAL LETTER O WITH DIAERESIS
        '\xd7'     #  0xD7 -> MULTIPLICATION SIGN
        '\xd8'     #  0xD8 -> LATIN CAPITAL LETTER O WITH STROKE
        '\xd9'     #  0xD9 -> LATIN CAPITAL LETTER U WITH GRAVE
        '\xda'     #  0xDA -> LATIN CAPITAL LETTER U WITH ACUTE
        '\xdb'     #  0xDB -> LATIN CAPITAL LETTER U WITH CIRCUMFLEX
        '\xdc'     #  0xDC -> LATIN CAPITAL LETTER U WITH DIAERESIS
        '\xdd'     #  0xDD -> LATIN CAPITAL LETTER Y WITH ACUTE
        '\xde'     #  0xDE -> LATIN CAPITAL LETTER THORN
        '\xdf'     #  0xDF -> LATIN SMALL LETTER SHARP S
        '\xe0'     #  0xE0 -> LATIN SMALL LETTER A WITH GRAVE
        '\xe1'     #  0xE1 -> LATIN SMALL LETTER A WITH ACUTE
        '\xe2'     #  0xE2 -> LATIN SMALL LETTER A WITH CIRCUMFLEX
        '\xe3'     #  0xE3 -> LATIN SMALL LETTER A WITH TILDE
        '\xe4'     #  0xE4 -> LATIN SMALL LETTER A WITH DIAERESIS
        '\xe5'     #  0xE5 -> LATIN SMALL LETTER A WITH RING ABOVE
        '\xe6'     #  0xE6 -> LATIN SMALL LETTER AE
        '\xe7'     #  0xE7 -> LATIN SMALL LETTER C WITH CEDILLA
        '\xe8'     #  0xE8 -> LATIN SMALL LETTER E WITH GRAVE
        '\xe9'     #  0xE9 -> LATIN SMALL LETTER E WITH ACUTE
        '\xea'     #  0xEA -> LATIN SMALL LETTER E WITH CIRCUMFLEX
        '\xeb'     #  0xEB -> LATIN SMALL LETTER E WITH DIAERESIS
        '\xec'     #  0xEC -> LATIN SMALL LETTER I WITH GRAVE
        '\xed'     #  0xED -> LATIN SMALL LETTER I WITH ACUTE
        '\xee'     #  0xEE -> LATIN SMALL LETTER I WITH CIRCUMFLEX
        '\xef'     #  0xEF -> LATIN SMALL LETTER I WITH DIAERESIS
        '\xf0'     #  0xF0 -> LATIN SMALL LETTER ETH
        '\xf1'     #  0xF1 -> LATIN SMALL LETTER N WITH TILDE
        '\xf2'     #  0xF2 -> LATIN SMALL LETTER O WITH GRAVE
        '\xf3'     #  0xF3 -> LATIN SMALL LETTER O WITH ACUTE
        '\xf4'     #  0xF4 -> LATIN SMALL LETTER O WITH CIRCUMFLEX
        '\xf5'     #  0xF5 -> LATIN SMALL LETTER O WITH TILDE
        '\xf6'     #  0xF6 -> LATIN SMALL LETTER O WITH DIAERESIS
        '\xf7'     #  0xF7 -> DIVISION SIGN
        '\xf8'     #  0xF8 -> LATIN SMALL LETTER O WITH STROKE
        '\xf9'     #  0xF9 -> LATIN SMALL LETTER U WITH GRAVE
        '\xfa'     #  0xFA -> LATIN SMALL LETTER U WITH ACUTE
        '\xfb'     #  0xFB -> LATIN SMALL LETTER U WITH CIRCUMFLEX
        '\xfc'     #  0xFC -> LATIN SMALL LETTER U WITH DIAERESIS
        '\xfd'     #  0xFD -> LATIN SMALL LETTER Y WITH ACUTE
        '\xfe'     #  0xFE -> LATIN SMALL LETTER THORN
        '\xff'     #  0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS
    )
    
    
    import collections
    
    encoding_table=codecs.charmap_build(decoding_table)
    mapped_ids = collections.OrderedDict()
    del_ind = []
    amt_of_encs = 0
    for w in range(len(word_list)):
        try:
            for char in word_list[w]:
                codecs.charmap_encode(char,"strict", encoding_table)
        except UnicodeEncodeError:
            amt_of_encs += 1
            del_ind.append(w)
            print(word_list[w], w)
        mapped_ids[skip_top+3+w] = w - amt_of_encs
    
    word_list = np.delete(word_list, del_ind)
    print("----------------------------------------")
    
    dt.write1dArray(word_list, "../data/sentiment/bow/names/" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification + ".txt")
    
    all_fn = "../data/sentiment/bow/frequency/phrases/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification
    all_fn_binary = "../data/sentiment/bow/binary/phrases/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification
    
    tf = np.zeros(shape=(len(vectors), len(word_list)), dtype=np.int32)
    tf_binary = np.zeros(shape=(len(vectors), len(word_list)), dtype=np.int32)
    
    for ds in range(len(vectors)): # d for document sequence
        for wi in range(len(vectors[ds])): # every word id in the sequence
            if vectors[ds][wi] >= skip_top+ 3:
                new_id = mapped_ids[vectors[ds][wi]]
                print(ds, new_id)
                tf[ds][new_id] += 1
                tf_binary[ds][new_id] = 1
    print("transposing")
    tf = np.asarray(tf, dtype="int").transpose()
    tf_binary = np.asarray(tf_binary, dtype="int").transpose()
    
    tf_sparse = sp.csr_matrix(tf)
    tf_binary_sparse = sp.csr_matrix(tf_binary)
    
    sp.save_npz(all_fn_binary, tf_binary_sparse)
    sp.save_npz(all_fn, tf_sparse)
    
    print("saving")
    
    #mt.printIndividualFromAll("sentiment",  "frequency/phrases", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=word_list)
    #mt.printIndividualFromAll("sentiment",  "binary/phrases", lowest_amt, highest_amt, classification, all_fn=all_fn_binary, names_array=word_list)
    
    ppmi_fn = "../data/sentiment/bow/ppmi/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification
    #if dt.fileExists(ppmi_fn) is False:
    
    ppmi = mt.convertPPMI( tf_sparse)
    
    ppmi_sparse = sp.csr_matrix(ppmi)
    
    sp.save_npz(ppmi_fn, ppmi_sparse)
    dt.write2dArray(ppmi, ppmi_fn)
    #mt.printIndividualFromAll("sentiment",  "ppmi", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=word_list)
    dt.write2dArray(tf_binary, all_fn_binary)
    dt.write2dArray(tf, all_fn)
    
    """

    print("1")
    classes = np.asarray(classes, dtype=np.int32)
    print(2)
    print(3)
    print(4)
    names = ["sentiment"]
    dt.write1dArray(names, "../data/sentiment/classify/sentiment/names.txt")
    dt.write1dArray(classes, "../data/sentiment/classify/sentiment/class-" + "sentiment")
    dt.write1dArray(classes,"../data/sentiment/classify/sentiment/class-all")
    dt.write1dArray(list(range(len(classes))), "../data/sentiment/classify/sentiment/available_entities.txt")
예제 #24
0
def parseTree(tree_fn, output_fn, entity_names_fn):
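    # Parse an indented class tree (asterisks/whitespace mark depth), collect the entities
    # listed under each class with enough members, then write per-class membership vectors
    # and a combined class-all matrix.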
    data_type = "placetypes"
    class_name = "opencyc"
    entity_names = dt.import1dArray(entity_names_fn)
    with open(tree_fn, "r") as infile:
        tree = [line for line in infile]
    tree = tree[1:]
    indexes_to_delete = []
    for l in range(len(tree)):
        tree[l] = re.sub(r'\s\*', ' ', tree[l])
        if "DELETE" in tree[l]:
            indexes_to_delete.append(l)

    tree = np.delete(tree, indexes_to_delete)
    entities_classes = {}

    for l in range(len(tree)):
        removed_asterisk = re.sub(r'\*', ' ', tree[l])
        stripped = removed_asterisk.strip()
        entities_classes[stripped] = []

    classes = []
    current_tabs = 0
    current_tabs_index = 0
    current_tab_class = []

    class_names = []
    next_index = 0
    for l in range(len(tree)-1):
        removed_asterisk = re.sub(r'\*', ' ', tree[l])
        entity = removed_asterisk.strip()

        tabs = len(tree[l]) - len(tree[l].strip())
        next_tabs = len(tree[l+1]) - len(tree[l+1].strip())
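        # The amount of whitespace stripped from a line is used as its depth in the tree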
        print("TRY", entity, tabs, next_tabs)
        # If the tree has a subclass
        if next_tabs > tabs and tabs <= 4:
            print("START", entity, tabs, next_tabs)
            for j in range(l+1, len(tree)):
                inner_tabs = len(tree[j]) - len(tree[j].strip())
                removed_asterisk = re.sub(r'\*', ' ', tree[j])
                inner_entity = removed_asterisk.strip()
                print("ADD", inner_entity)
                if inner_tabs <= tabs:
                    print("END", inner_tabs, tabs)
                    break
                else:
                    entities_classes[entity].append(inner_entity)
                    print("found", inner_entity, "added to", entity)

    found_entities = []
    found_arrays = []
    class_names = []
    for key, value in list(entities_classes.items()):
        if len(value) < 30:
            del entities_classes[key]
            continue
        """ Removing entities that aren't in a list
        found = False
        for e in entity_names:
            if key == e:
                found = True
        if not found:
            del entities_classes[key]
            continue
        """
        for v in value:
            found_entities.append(v)
        found_arrays.append(value)
        class_names.append(key)
    found_entities = np.unique(np.asarray(found_entities))
    dt.write1dArray(found_entities, "../data/"+data_type+"/classify/"+class_name+"/available_entities.txt")

    # Sort keys and values
    index = np.argsort(class_names)
    sorted_class_names = []
    sorted_value_names = []
    for i in index:
        sorted_class_names.append(class_names[i])
        sorted_value_names.append(found_arrays[i])
    value_indexes = []
    # Convert values to indexes
    for v in range(len(sorted_value_names)):
        value_index = []
        for g in range(len(sorted_value_names[v])):
            for e in range(len(found_entities)):
                if sorted_value_names[v][g] == found_entities[e]:
                    value_index.append(e)
        value_indexes.append(value_index)

    matrix = np.asarray([[0]* len(entities_classes)]*len(found_entities))
    for c in range(len(sorted_class_names)):
        print(c)
        print("-------------------")
        for v in value_indexes[c]:
            print(v)
            matrix[v, c] = 1
        dt.write1dArray(matrix[:, c], "../data/placetypes/classify/opencyc/class-" + sorted_class_names[c])

    matrix = np.asarray(matrix)

    dt.write2dArray(matrix, "../data/placetypes/classify/opencyc/class-all")
예제 #25
0
    def __init__(self,
                 vector_path,
                 class_path,
                 property_names_fn,
                 file_name,
                 svm_type,
                 training_size=10000,
                 lowest_count=200,
                 highest_count=21470000,
                 get_kappa=True,
                 get_f1=True,
                 single_class=True,
                 data_type="movies",
                 getting_directions=True,
                 threads=1,
                 chunk_amt=0,
                 chunk_id=0,
                 rewrite_files=False,
                 classification="all",
                 loc="../data/",
                 logistic_regression=False,
                 sparse_array_fn=None,
                 only_these_fn=None):

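        # Depending on the mode, trains one linear classifier (SVM or logistic regression) per
        # property/class and writes learned directions and/or evaluation scores
        # (kappa, F1, accuracy, TP/FP/TN/FN) to disk.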
        self.get_kappa = True
        self.get_f1 = get_f1
        self.data_type = data_type
        self.classification = classification
        self.lowest_amt = lowest_count
        self.higher_amt = highest_count

        if chunk_amt > 0:
            file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(
                chunk_amt)

        directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
        ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
        kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
        acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"
        TP_fn = loc + data_type + "/svm/stats/TP " + file_name + ".txt"
        FP_fn = loc + data_type + "/svm/stats/FP " + file_name + ".txt"
        TN_fn = loc + data_type + "/svm/stats/TN " + file_name + ".txt"
        FN_fn = loc + data_type + "/svm/stats/FN " + file_name + ".txt"

        all_fns = [directions_fn, kappa_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "getSVMResults")
            return
        else:
            print("Running task", "getSVMResults")

        y_train = 0
        y_test = 0
        vectors = np.asarray(dt.import2dArray(vector_path))
        print("imported vectors")
        if not getting_directions:
            classes = np.asarray(dt.import2dArray(class_path))
            print("imported classes")

        property_names = dt.import1dArray(property_names_fn)
        print("imported property names")
        if chunk_amt > 0:
            if chunk_id == chunk_amt - 1:
                chunk = int(len(property_names) / chunk_amt)
                multiply = chunk_amt - 1
                property_names = property_names[chunk * multiply:]
            else:
                property_names = dt.chunks(
                    property_names, int(
                        (len(property_names) / chunk_amt)))[chunk_id]

        if sparse_array_fn is not None:
            sparse_array = dt.import2dArray(sparse_array_fn)
        else:
            sparse_array = None

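        # Rows of the sparse class array with at most one nonzero entry will make the corresponding SVM fail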
        if sparse_array is not None:
            for s in range(len(sparse_array)):
                if len(np.nonzero(sparse_array[s])[0]) <= 1:
                    print("WILL FAIL", s, len(np.nonzero(sparse_array[s])[0]))
                else:
                    print(len(np.nonzero(sparse_array[s])[0]))

        if not getting_directions:
            x_train, x_test, y_train, y_test = train_test_split(vectors,
                                                                classes,
                                                                test_size=0.3,
                                                                random_state=0)
        else:
            x_train = vectors
            x_test = vectors

        if get_f1:
            y_train = y_train.transpose()
            y_test = y_test.transpose()
            print("transposed")
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        if only_these_fn is not None:
            only_these = dt.import1dArray(only_these_fn, "s")
            inds = []
            for s in range(len(property_names)):
                for o in only_these:
                    if property_names[s] == o:
                        inds.append(s)
                        break
            sparse_array = sparse_array[inds]
            property_names = property_names[inds]

        if self.get_f1 is False:
            print("running svms")
            kappa_scores, directions, f1_scores, property_names, accs, TPs, FPs, TNs, FNs = self.runAllSVMs(
                y_test, y_train, property_names, file_name, svm_type,
                getting_directions, threads, logistic_regression, sparse_array)

            dt.write1dArray(kappa_scores, kappa_fn)
            dt.write2dArray(directions, directions_fn)
            dt.write1dArray(f1_scores, ktau_scores_fn)
            dt.write1dArray(accs, acc_fn)
            dt.write1dArray(TPs, TP_fn)
            dt.write1dArray(FPs, FP_fn)
            dt.write1dArray(TNs, TN_fn)
            dt.write1dArray(FNs, FN_fn)
            dt.write1dArray(property_names,
                            property_names_fn + file_name + ".txt")
        else:
            final_f1 = []
            final_acc = []
            for y in range(len(y_train)):
                f1, acc = self.runClassifySVM(y_test[y], y_train[y])
                print(f1, acc)
                final_f1.append(f1)
                final_acc.append(acc)
            dt.write1dArray(final_f1, ktau_scores_fn)
            dt.write1dArray(final_acc, acc_fn)
예제 #26
0
def main(data_type, output_folder, grams, no_below, no_above, bowmin):
    if data_type == "newsgroups":
        newsgroups = fetch_20newsgroups(subset='all',
                                        shuffle=False,
                                        remove=("headers", "footers",
                                                "quotes"))
        corpus = newsgroups.data
        classes = newsgroups.target
        encoding_type = "utf8"
    elif data_type == "sentiment":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=0,
                                                              skip_top=0,
                                                              index_from=0,
                                                              seed=113)
        corpus = np.concatenate((x_train, x_test), axis=0)
        classes = np.concatenate((y_train, y_test), axis=0)
        corpus = makeCorpusFromIds(corpus, imdb.get_word_index())
        encoding_type = "utf8"
    else:
        corpus = dt.import1dArray(output_folder + "duplicate_removed_docs.txt")
        classes = dt.import2dArray(
            output_folder + "duplicate_removed_classes.txt", "i")
        encoding_type = "utf8"

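    # First pass over the raw corpus: tokenize, build the vocabulary, drop empty documents,
    # and save the id mappings, bag-of-words and word lists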
    file_name = "simple_numeric"
    processed_corpus = preprocess(corpus)
    tokenized_corpus = naiveTokenizer(processed_corpus)
    vocab, dct, id2token = getVocab(tokenized_corpus)
    processed_corpus, tokenized_corpus, remove_ind, classes = removeEmpty(
        processed_corpus, tokenized_corpus, classes)
    bow, bow_vocab = doc2bow(tokenized_corpus, dct, bowmin)
    print(bowmin, len(list(bow_vocab.keys())), "|||", bow.shape)
    filtered_bow, word_list, filtered_vocab = filterBow(
        tokenized_corpus, dct, no_below, no_above)
    tokenized_ids = tokensToIds(tokenized_corpus, vocab)
    print(output_folder + file_name + "_remove.npy")
    np.save(output_folder + file_name + "_remove.npy", remove_ind)
    np.save(output_folder + file_name + "_corpus.npy", tokenized_corpus)
    np.save(output_folder + file_name + "_tokenized_corpus.npy", tokenized_ids)
    np.save(output_folder + file_name + "_vocab " + str(bowmin) + ".npy",
            bow_vocab)
    np.save(output_folder + file_name + "_filtered_vocab.npy", filtered_vocab)
    dt.write1dArray(processed_corpus,
                    output_folder + file_name + "_corpus_processed.txt",
                    encoding=encoding_type)
    np.save(output_folder + file_name + "_classes.npy", classes)
    if data_type != "reuters":
        np.save(output_folder + file_name + "_classes_categorical.npy",
                to_categorical(classes))
    sp.save_npz(output_folder + file_name + ".npz", bow)
    dt.write1dArray(word_list,
                    output_folder + file_name + "_words.txt",
                    encoding=encoding_type)
    dt.write1dArray(list(bow_vocab.keys()),
                    output_folder + file_name + "_all_words_2.txt",
                    encoding=encoding_type)
    """
    if grams > 0:
        for i in range(2, grams):  # Up to 5-length grams
            processed_corpus, tokenized_corpus = ngrams(tokenized_corpus)
            vocab, dct, id2token = getVocab(tokenized_corpus)
            bow = doc2bow(tokenized_corpus, dct, 100, 10)
            tokenized_ids = tokensToIds(tokenized_corpus, vocab)
            np.save(output_folder + file_name + "_corpus " + str(i) + "-gram" + ".npy", tokenized_corpus)
            np.save(output_folder + file_name + "_tokenized_corpus " + str(i) + "-gram" + ".npy", tokenized_ids)
            np.save(output_folder + file_name + "_vocab " + str(i) + "-gram" + ".npy", vocab)
            dt.write1dArray(processed_corpus, output_folder + file_name + "_corpus_processed " + str(i) + "-gram" + ".txt")
            sp.save_npz(output_folder + file_name + "_bow " + str(i) + "-gram" + ".npz", bow)
            dt.write1dArray(word_list, output_folder + file_name + "_words.txt")
    """

    file_name += "_stopwords"

    filtered_ppmi_fn = "../data/" + data_type + "/bow/ppmi/" + file_name + "_ppmi " + str(
        no_below) + "-" + str(no_above) + "-all.npz"
    ppmi_fn = "../data/" + data_type + "/bow/ppmi/" + file_name + "_ppmi " + str(
        bowmin) + "-all.npz"
    bow_fn = "../data/" + data_type + "/bow/frequency/phrases/" + file_name + "_bow " + str(
        bowmin) + "-all.npz"
    filtered_bow_fn = "../data/" + data_type + "/bow/frequency/phrases/" + file_name + "_bow " + str(
        no_below) + "-" + str(no_above) + "-all.npz"

    # Re-initialize so that we don't start with an already filtered corpus

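    # Second pass: redo the same preprocessing on the stop-word-filtered corpus;
    # outputs carry the "_stopwords" suffix added to file_name above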
    tokenized_corpus, processed_corpus = removeStopWords(tokenized_corpus)
    processed_corpus, tokenized_corpus, remove_ind, classes = removeEmpty(
        processed_corpus, tokenized_corpus, classes)
    vocab, dct, id2token = getVocab(tokenized_corpus)
    bow, bow_vocab = doc2bow(tokenized_corpus, dct, bowmin)
    print(bowmin, len(list(bow_vocab.keys())), "|||", bow.shape)
    filtered_bow, word_list, filtered_vocab = filterBow(
        tokenized_corpus, dct, no_below, no_above)
    tokenized_ids = tokensToIds(tokenized_corpus, vocab)

    print(output_folder + file_name + "_remove.npy")
    print(output_folder + file_name + "_corpus.npy")
    print(output_folder + file_name + "_tokenized_corpus.npy")
    print(output_folder + file_name + "_id2token.npy")
    print(output_folder + file_name + "_vocab " + str(bowmin) + ".npy")
    print(output_folder + file_name + "_corpus_processed.txt")
    print(output_folder + file_name + "_classes.npy")
    print(output_folder + file_name + "_classes_categorical.npy")
    np.save(output_folder + file_name + "_id2token.npy", id2token)
    np.save(output_folder + file_name + "_remove.npy", remove_ind)
    np.save(output_folder + file_name + "_vocab " + str(bowmin) + ".npy",
            bow_vocab)
    np.save(output_folder + file_name + "_filtered_vocab.npy", filtered_vocab)
    np.save(output_folder + file_name + "_corpus.npy", tokenized_corpus)
    np.save(output_folder + file_name + "_tokenized_corpus.npy", tokenized_ids)
    dt.write1dArray(processed_corpus,
                    output_folder + file_name + "_corpus_processed.txt",
                    encoding=encoding_type)
    np.save(output_folder + file_name + "_classes.npy", classes)
    if data_type != "reuters":
        np.save(output_folder + file_name + "_classes_categorical.npy",
                to_categorical(classes))

    print("------------------- Saved most, moving to PPMI etc", file_name)

    print(bow_fn)
    print(filtered_bow_fn)
    sp.save_npz(bow_fn, bow)
    sp.save_npz(filtered_bow_fn, filtered_bow)

    dt.write1dArray(word_list,
                    "../data/" + data_type + "/bow/names/" + file_name +
                    "_words " + str(no_below) + "-" + str(no_above) +
                    "-all.txt",
                    encoding=encoding_type)

    dt.write1dArray(list(bow_vocab.keys()),
                    "../data/" + data_type + "/bow/names/" + file_name +
                    "all_words_2_no_sw.txt",
                    encoding=encoding_type)
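    # The filtered bag-of-words is transposed for the PPMI conversion and transposed back before saving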
    filtered_bow = filtered_bow.transpose()
    ppmi = sparse_ppmi.convertPPMISparse(filtered_bow)
    filtered_ppmi_sparse = sp.csr_matrix(ppmi).transpose()

    print(filtered_ppmi_fn)
    sp.save_npz(filtered_ppmi_fn, filtered_ppmi_sparse)

    if data_type == "reuters":
        testAll(["filtered_freq_bow", "filtered_ppmi_bow"], [
            filtered_ppmi_sparse.transpose().todense(),
            filtered_bow.todense()
        ], [classes, classes], data_type)
    else:
        testAll(["filtered_freq_bow", "filtered_ppmi_bow"], [
            filtered_ppmi_sparse.transpose().todense(),
            filtered_bow.todense()
        ], [to_categorical(classes),
            to_categorical(classes)], data_type)

    # Create PCA
    #classes = dt.import2dArray("../data/movies/classify/genres/class-all", "i")
    #bow = sp.csr_matrix(dt.import2dArray("../data/movies/bow/frequency/phrases/class-all-15-5-genres", "i")).transpose()
    ppmi = sparse_ppmi.convertPPMISparse(bow)
    ppmi_sparse = sp.csr_matrix(ppmi).transpose()

    print(ppmi_fn)
    sp.save_npz(ppmi_fn, ppmi_sparse)

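    # Reduce the full (unfiltered) PPMI matrix to dense spaces of several sizes with PCA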
    pca_size = [50, 100, 200]
    for p in pca_size:
        pca_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + "_ppmi " + str(
            bowmin) + " S" + str(p) + "-all.npy"
        PCA_ppmi = getPCA(ppmi_sparse, p)
        np.save(pca_fn, PCA_ppmi)
    """
    if grams > 0:
        for i in range(2, grams+1):  # Up to 5-length grams

            filtered_ppmi_fn = "../data/"+data_type+"/bow/ppmi/" + file_name + "_ppmi " + str(
                grams) + "-gram" + str(no_below) + "-" + str(
                no_above) + "-all.npz"
            ppmi_fn = "../data/"+data_type+"/bow/ppmi/" + file_name + "_ppmi " + str(
                grams) + "-gram2" + "-all.npz"
            bow_fn = "../data/"+data_type+"/bow/frequency/phrases/" + file_name + "_bow " + str(
                grams) + "-gram2" + "-all.npz"
            filtered_bow_fn = "../data/"+data_type+"/bow/frequency/phrases/" + file_name + "_bow " + str(
                grams) + "-gram" + str(
                no_below) + \
                              "-" + str(no_above) + "-all.npz"

            processed_corpus, tokenized_corpus = ngrams(tokenized_corpus)
            vocab, dct = getVocab(tokenized_corpus)
            bow = doc2bow(tokenized_corpus, dct, 0)
            filtered_bow, word_list = filterBow(tokenized_corpus, dct, no_below-bowmin, no_above)
            tokenized_ids = tokensToIds(tokenized_corpus, vocab)
            np.save(output_folder + file_name + "_corpus " + str(i) + "-gram" + ".npy", tokenized_corpus)
            np.save(output_folder + file_name + "_tokenized_corpus " + str(i) + "-gram" + ".npy", tokenized_ids)
            np.save(output_folder + file_name + "_vocab " + str(i) + "-gram" + ".npy", vocab)
            dt.write1dArray(processed_corpus, output_folder + file_name + "_corpus_processed " + str(i) + "-gram" + ".txt")

            sp.save_npz(bow_fn, bow)
            sp.save_npz(filtered_bow_fn, filtered_bow)

            dt.write1dArray(word_list, "../data/"+data_type+"/bow/names/" + file_name + "_words "  + str(i) + "-gram"  +
                            str(no_below) + "-" + str(no_above) + "-all.txt")
            filtered_bow = filtered_bow.transpose()
            ppmi = sparse_ppmi.convertPPMISparse(filtered_bow)
            filtered_ppmi_sparse = sp.csr_matrix(ppmi).transpose()
            sp.save_npz(filtered_ppmi_fn, filtered_ppmi_sparse)
            # Create PCA

            bow = bow.transpose()
            ppmi = sparse_ppmi.convertPPMISparse(bow)
            ppmi_sparse = sp.csr_matrix(ppmi).transpose()
            sp.save_npz(ppmi_fn, ppmi_sparse)
            pca_fn = "../data/"+data_type+"/nnet/spaces/" + file_name + "_ppmi " + str(grams) + "-gram" + str(
                no_below) + "-" + str(
                no_above) + "-all.npy"

            PCA_ppmi = getPCA(ppmi_sparse, 100)
            np.save(pca_fn, PCA_ppmi)

    """
    """
    file_name += "_stopwords"
    filtered_ppmi_fn = "../data/"+data_type+"/bow/ppmi/" + file_name + "_ppmi " + str(no_below) + "-" + str(
        no_above) + "-all.npz"
    filtered_bow_fn = "../data/"+data_type+"/bow/frequency/phrases/" + file_name + "_bow " + str(
        no_below) + "-" + str(no_above) + "-all.npz"
    pca_fn = "../data/"+data_type+"/nnet/spaces/" + file_name + "_ppmi " +  str(
        no_below) + "-" + str(
        no_above) + "-all.npy"
    filtered_ppmi_sparse = sp.load_npz(filtered_ppmi_fn)
    PCA_ppmi = np.load(pca_fn)
    filtered_bow = sp.load_npz(filtered_bow_fn)
    """
    # Create averaged word vectors
    if data_type == "reuters":
        testAll(["ppmi_pca"], [PCA_ppmi], [classes], data_type)
    else:
        testAll(["ppmi_pca"], [PCA_ppmi], [to_categorical(classes)], data_type)
예제 #27
0
    def __init__(self,
                 training_data=10000,
                 class_path=None,
                 network_type="ft",
                 randomize_finetune_weights=False,
                 dropout_noise=None,
                 amount_of_hidden=0,
                 epochs=1,
                 learn_rate=0.01,
                 loss="mse",
                 batch_size=1,
                 past_model_bias_fn=None,
                 identity_swap=False,
                 reg=0.0,
                 amount_of_finetune=1,
                 output_size=25,
                 hidden_activation="tanh",
                 layer_init="glorot_uniform",
                 output_activation="tanh",
                 deep_size=None,
                 corrupt_finetune_weights=False,
                 hidden_layer_size=100,
                 file_name="unspecified_filename",
                 vector_path=None,
                 is_identity=False,
                 activity_reg=0.0,
                 finetune_size=0,
                 data_type="movies",
                 optimizer_name="rmsprop",
                 noise=0.0,
                 fine_tune_weights_fn=None,
                 past_model_weights_fn=None,
                 from_ae=True,
                 class_outputs=False,
                 finetune_activation="linear"):

        self.model = Sequential()
        self.training_data = training_data
        self.class_path = class_path
        self.learn_rate = learn_rate
        self.epochs = epochs
        self.loss = loss
        self.batch_size = batch_size
        self.hidden_activation = hidden_activation
        self.layer_init = layer_init
        self.output_activation = output_activation
        self.hidden_layer_size = hidden_layer_size
        self.file_name = file_name
        self.vector_path = vector_path
        self.dropout_noise = dropout_noise
        self.finetune_size = finetune_size
        self.class_outputs = class_outputs
        self.reg = reg
        self.activity_reg = activity_reg
        self.amount_of_finetune = amount_of_finetune
        self.amount_of_hidden = amount_of_hidden
        self.output_size = output_size
        self.finetune_activation = finetune_activation

        print(data_type)

        if optimizer_name == "adagrad":
            self.optimizer = Adagrad()
        else:
            self.optimizer = SGD(lr=learn_rate,
                                 momentum=0.0,
                                 decay=0.0,
                                 nesterov=False)

        entity_vectors, entity_classes = None, None

        if network_type == "ft":
            entity_vectors, entity_classes = self.fineTuneNetwork(
                past_model_weights_fn, past_model_bias_fn,
                fine_tune_weights_fn, is_identity, identity_swap,
                randomize_finetune_weights, corrupt_finetune_weights,
                deep_size, from_ae)
        elif network_type == "da":
            entity_vectors, entity_classes = self.denoisingAutoencoder(
                noise, deep_size)

        x_train, x_test, y_train, y_test = train_test_split(entity_vectors,
                                                            entity_classes,
                                                            test_size=0.3,
                                                            random_state=0)

        #x_train, y_train = dt.balance2dClasses(x_train, y_train, 1)

        # Compile the model and fit it to the data
        self.model.fit(x_train,
                       y_train,
                       nb_epoch=self.epochs,
                       batch_size=self.batch_size,
                       verbose=1)

        if network_type == "ft":
            if class_outputs:
                scores = []
                y_pred = self.model.predict(x_test)
                y_pred[y_pred >= 0.5] = 1
                y_pred[y_pred < 0.5] = 0
                f1 = f1_score(y_test, y_pred, average="macro")

                accuracy_array = []
                for y in range(len(y_pred)):
                    accuracy_array.append(accuracy_score(y_test[y], y_pred[y]))
                accuracy = np.mean(accuracy_array)

                scores.append(f1)
                scores.append(accuracy)
                dt.write1dArray(
                    scores, "../data/" + data_type + "/nnet/scores/" +
                    self.file_name + ".txt")
                print(scores)
            self.output_clusters = self.model.predict(entity_vectors)
            dt.write2dArray(
                self.output_clusters.transpose(), "../data/" + data_type +
                "/nnet/clusters/" + self.file_name + ".txt")

        total_file_name = "../data/" + data_type + "/nnet/spaces/" + self.file_name
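        # Write the hidden representation after each layer by predicting with progressively longer truncated copies of the model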
        for l in range(0, len(self.model.layers) - 1):
            if dropout_noise is not None and dropout_noise > 0.0:
                if l % 2 == 1:
                    continue
            print("Writing", l, "layer")
            truncated_model = Sequential()
            for a in range(l + 1):
                truncated_model.add(self.model.layers[a])
            truncated_model.compile(loss=self.loss, optimizer="sgd")
            self.end_space = truncated_model.predict(entity_vectors)
            dt.write2dArray(self.end_space,
                            total_file_name + "L" + str(l) + ".txt")

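        # Dump each layer's weight matrix and bias vector; layers without weights raise IndexError and are skipped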
        for l in range(len(self.model.layers)):
            try:
                dt.write2dArray(
                    self.model.layers[l].get_weights()[0],
                    "../data/" + data_type + "/nnet/weights/L" + str(l) +
                    file_name + ".txt")
                dt.write1dArray(
                    self.model.layers[l].get_weights()[1], "../data/" +
                    data_type + "/nnet/bias/L" + str(l) + file_name + ".txt")
            except IndexError:
                print("Layer ", str(l), "Failed")
예제 #28
0
def getVectors(input_folder, file_names_fn, extension, output_folder, only_words_in_x_entities,
               words_without_x_entities, cut_first_line=False, get_all=False, additional_name="", make_individual=True,
               classification="", use_all_files="", minimum_words=0, data_type="", sparse_matrix=False, word_count_amt = 0):
    if use_all_files is None:
        file_names = dt.import1dArray(file_names_fn)
    else:
        file_names = dt.getFns(use_all_files)

    phrase_dict = defaultdict(int)
    failed_indexes = []
    failed_filenames = []
    working_filenames = []

    # First, get all possible phrase names and build a dictionary of them from the files

    for f in range(len(file_names)):
        try:
            full_name = input_folder + file_names[f] + "." + extension
            phrase_list = dt.import2dArray(full_name, "s")
            if cut_first_line:
                phrase_list = phrase_list[1:]
            word_count = 0
            for p in phrase_list:
                word_count += int(p[1])
            if word_count > word_count_amt:
                for p in phrase_list:
                    if p[0] != "all":
                        phrase_dict[p[0]] += 1
                    else:
                        print("found class all")
                working_filenames.append(file_names[f])
            else:
                print("Failed, word count below threshold", file_names[f], f, word_count)
                failed_filenames.append(file_names[f])
                failed_indexes.append(f)
        except FileNotFoundError:
            print("Failed to find", file_names[f], f)
            failed_filenames.append(file_names[f])
            failed_indexes.append(f)
    print(failed_indexes)
    print(failed_filenames)
    phrase_sets = []
    # Convert to array so we can sort it
    phrase_list = []


    entity_names = dt.import1dArray(file_names_fn)
    matching_filenames = []
    failed_fns = []
    if data_type == "wines":
        for e in entity_names:
            found = False
            for f in working_filenames:

                if "zz" in f:
                    new_f = f[2:]
                else:
                    new_f = f
                if dt.removeEverythingFromString(e) == dt.removeEverythingFromString(new_f):
                    matching_filenames.append(f)
                    found = True
                    break
            if not found:
                failed_fns.append(e)

        working_filenames = np.unique(np.asarray(matching_filenames))

    test_dupes = np.unique(np.asarray(working_filenames))
    print(len(test_dupes))

    for key, value in phrase_dict.items():
        if value >= only_words_in_x_entities:
            phrase_list.append(key)
    all_phrases = []
    for key, value in phrase_dict.items():
        all_phrases.append(key)

    phrase_sets.append(phrase_list)
    counter = 0
    for phrase_list in phrase_sets:
        if not get_all and counter > 0:
            break
        all_phrase_fn = output_folder+"frequency/phrases/" + "class-all-" +str(only_words_in_x_entities) + "-"+str(words_without_x_entities)+"-"+ classification
        phrase_name_fn = output_folder + "names/"  +str(only_words_in_x_entities) + "-"+str(words_without_x_entities)+"-"+ classification +".txt"
        phrase_list = sorted(phrase_list)

        print("Found", len(phrase_list), "Phrases")
        print(phrase_list[:20])
        print("Failed", len(failed_filenames), "Files")
        print(failed_filenames[:20])

        phrase_index_dict = defaultdict()

        # Create a dictionary to obtain the index of a phrase that's being checked

        for p in range(len(phrase_list)):
            phrase_index_dict[phrase_list[p]] = p

        # Create an empty 2d array to store a matrix of movies and phrases
        all_phrases_complete = []
        for f in working_filenames:
            all_phrases_complete.append([0]*len(phrase_list))

        all_phrases_complete = np.asarray(all_phrases_complete)

        print("Each entity is length", len(all_phrases_complete[0]))
        print("The overall matrix is", len(all_phrases_complete))
        if sparse_matrix:
            all_phrases_complete = sp.csr_matrix(all_phrases_complete)


        # Then, populate the overall bag of words for each film (with all other phrases already set to 0

        completed_index = []

        if data_type == "wines":

            print("wines")
            """
            merge_indexes = []
            for f in range(len(working_filenames)):
                print(working_filenames[f])
                for i in range(len(working_filenames)):
                    if i == f:
                        continue
                    for ci in completed_index:
                        if i == ci:
                            continue
                    if "~" in working_filenames[i]:
                        if working_filenames[f] == working_filenames[i][:-1] or working_filenames[f] == working_filenames[i][2:-1]:
                            completed_index.append(i)
                            merge_indexes.append((f, i))
            """

        for f in range(len(working_filenames)):
            n_phrase_list = dt.import2dArray(input_folder + working_filenames[f] + "." + extension, "s")
            if cut_first_line:
                n_phrase_list = n_phrase_list[1:]
            for p in n_phrase_list:
                phrase = p[0]
                try:
                    phrase_index = phrase_index_dict[phrase]
                    if not sparse_matrix:
                        all_phrases_complete[f][phrase_index] = int(p[1])
                    else:
                        all_phrases_complete[f, phrase_index] = int(p[1])

                    #print("Kept", phrase)
                except KeyError:
                    continue
                    #print("Deleted phrase", phrase)
        """

        cols_to_delete = []
        if data_type == "wines":
            for mt in merge_indexes:
                for v in range(len(all_phrases_complete)):
                    all_phrases_complete[v][mt[0]] += all_phrases_complete[v][mt[1]]
                cols_to_delete.append(mt[1])
        all_phrases_complete = np.delete(all_phrases_complete, cols_to_delete, 1)
        working_filenames = np.delete(working_filenames, cols_to_delete)
        """

        # Import entities specific to the thing
        # Trim the phrases of entities that aren't included in the classfication
        if classification not in ("all", "mixed", "genres", "ratings", "types"):
            classification_entities = dt.import1dArray("../data/" + data_type + "/classify/" + classification + "/available_entities.txt")
            all_phrases_complete = dt.match_entities(all_phrases_complete, classification_entities, file_names)
        elif classification == "all":
            print("All~~~~~~~~~~~~~~")
            dt.write1dArray(working_filenames, "../data/"+data_type+"/classify/"+classification+"/available_entities.txt")
        if not sparse_matrix:
            all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        else:
            all_phrases_complete = all_phrases_complete.transpose()

        indexes_to_delete = []
        if sparse_matrix:
            cx = sp.coo_matrix(all_phrases_complete)

            indexes_to_delete = []

            for i, j, v in zip(cx.row, cx.col, cx.data):
                print("(%d, %d), %s" % (i, j, v))
        for a in range(len(all_phrases_complete)):
            if np.count_nonzero(all_phrases_complete[a]) > len(all_phrases_complete[a]) - (words_without_x_entities):
                print("Recorded a phrase " + str(phrase_list[a]) + " with too little difference")
                indexes_to_delete.append(a)
        indexes_to_delete.sort()
        indexes_to_delete.reverse()
        for i in indexes_to_delete:
            all_phrases_complete = np.delete(all_phrases_complete, i, 0)
            print("Deleted a phrase " + str(phrase_list[i]) + " with too little difference")
            phrase_list = np.delete(phrase_list, i, 0)

        dt.write1dArray(phrase_list, phrase_name_fn)
        if make_individual:
            for p in range(len(all_phrases_complete)):
                dt.write1dArray(all_phrases_complete[p], output_folder+"frequency/phrases/class-" + phrase_list[p] +
                                 "-"+str(only_words_in_x_entities) + "-"+str(words_without_x_entities)+"-"+ classification)



        dt.write2dArray(all_phrases_complete, all_phrase_fn)


        print("Created class-all")
        all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        for a in range(len(all_phrases_complete)):
            for v in range(len(all_phrases_complete[a])):
                if all_phrases_complete[a][v] > 1:
                    all_phrases_complete[a][v] = 1

        all_phrases_complete = np.asarray(all_phrases_complete).transpose()

        if make_individual:
            for p in range(len(all_phrases_complete)):
                dt.write1dArray(all_phrases_complete[p], output_folder+"binary/phrases/class-" + phrase_list[p] +
                                "-"+str(only_words_in_x_entities) + "-"+str(words_without_x_entities)+"-"+ classification)



        all_phrase_fn = output_folder + "binary/phrases/" + "class-all-" + str(
            only_words_in_x_entities) + "-" + str(words_without_x_entities) + "-" + classification
        dt.write2dArray(all_phrases_complete, all_phrase_fn)

        print("Created class-all binary")
        counter += 1
예제 #29
0
def regularNewsgroupsStuff():  # Rename later
    classification = "all"
    highest_amt = 18836

    lowest_amt = 30
    all_fn = "../data/newsgroups/bow/frequency/phrases/class-all-" + str(
        lowest_amt) + "-" + str(highest_amt) + "-" + classification
    #newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False, remove=("headers", "footers", "quotes"))
    #newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False, remove=("headers", "footers", "quotes"))

    all = fetch_20newsgroups(subset='all',
                             shuffle=False,
                             remove=("headers", "footers", "quotes"))

    train_len = len(all.data)

    print(all.target[train_len - 1])
    print(all.target[train_len - 2])
    print(all.target[train_len - 3])
    print(all.target[0])
    print(all.target[1])
    print(all.target[2])

    vectors = all.data
    classes = all.target

    ac_x_train = vectors[:11314]
    ac_x_test = vectors[11314:]
    ac_y_train = classes[:11314]
    ac_y_test = classes[11314:]

    print(classes[train_len - 1])
    print(classes[train_len - 2])
    print(classes[train_len - 3])

    tf_vectorizer = CountVectorizer(max_df=highest_amt,
                                    min_df=lowest_amt,
                                    stop_words='english')
    print("completed vectorizer")
    tf = tf_vectorizer.fit(vectors)
    feature_names = tf.get_feature_names()
    dt.write1dArray(
        feature_names, "../data/newsgroups/bow/names/" + str(lowest_amt) +
        "-" + str(highest_amt) + "-" + classification + ".txt")
    dict = tf.vocabulary_
    tf = tf_vectorizer.transform(vectors)
    dense = FunctionTransformer(lambda x: x.todense(), accept_sparse=True)
    tf = dense.fit_transform(tf)
    tf = np.squeeze(np.asarray(tf))
    tf = np.asarray(tf, dtype=np.int32)
    tf = tf.transpose()
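    # After the transpose each row of tf is a term; sum its counts across all documents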
    freqs = []
    for t in tf:
        freq = 0
        for i in range(len(t)):
            if t[i] != 0:
                freq += t[i]
        freqs.append(freq)
    print("Amount of terms:", len(tf))
    dt.write1dArray(
        freqs, "../data/newsgroups/bow/freq_count/" + str(lowest_amt) + "-" +
        str(highest_amt))
    #dt.write2dArray(tf, all_fn)
    #mt.printIndividualFromAll("newsgroups",  "frequency/phrases", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=feature_names)
    ppmi_fn = "../data/newsgroups/bow/ppmi/class-all-" + str(
        lowest_amt) + "-" + str(highest_amt) + "-" + classification
    #if dt.fileExists(ppmi_fn) is False:
    tf = sp.csr_matrix(tf)
    sp.save_npz(all_fn, tf)
    ppmi = mt.convertPPMI(tf)
    #dt.write2dArray(ppmi, ppmi_fn)
    ppmi_sparse = sp.csr_matrix(ppmi)
    sp.save_npz(ppmi_fn, ppmi_sparse)
    mt.printIndividualFromAll("newsgroups",
                              "ppmi",
                              lowest_amt,
                              highest_amt,
                              classification,
                              all_fn=all_fn,
                              names_array=feature_names)

    print("1")
    classes = np.asarray(classes, dtype=np.int32)
    print(2)
    classes_dense = np.zeros(shape=(len(classes), np.amax(classes) + 1),
                             dtype=np.int8)
    print(3)
    for c in range(len(classes)):
        classes_dense[c][classes[c]] = 1
    print(4)
    names = list(all.target_names)
    dt.write1dArray(names, "../data/newsgroups/classify/newsgroups/names.txt")
    classes_dense = classes_dense.transpose()
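    # After this transpose each row of classes_dense is one newsgroup; write each as a per-class binary vector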
    for c in range(len(classes_dense)):
        dt.write1dArray(
            classes_dense[c],
            "../data/newsgroups/classify/newsgroups/class-" + names[c])
    classes_dense = classes_dense.transpose()

    dt.write2dArray(classes_dense,
                    "../data/newsgroups/classify/newsgroups/class-all")

    feature_names = dt.import1dArray("../data/newsgroups/bow/names/" +
                                     str(lowest_amt) + "-" + str(highest_amt) +
                                     "-all.txt")

    freq = dt.import2dArray(all_fn)

    binary = np.zeros(shape=(len(freq), len(freq[0])))
    for i in range(len(freq)):
        for j in range(len(freq[i])):
            if freq[i][j] > 0:
                binary[i][j] = 1
    binary_all_fn = "../data/newsgroups/bow/binary/phrases/class-all-" + str(
        lowest_amt) + "-" + str(highest_amt) + "-" + classification
    binary = sp.csr_matrix(binary)
    sp.save_npz(binary_all_fn, binary)
    #dt.write2dArray(binary, binary_all_fn)

    #mt.printIndividualFromAll("newsgroups",  "binary/phrases", lowest_amt, highest_amt, classification, all_fn=all_fn, names_array=feature_names)
    #ppmi_fn = "../data/newsgroups/bow/ppmi/class-all-"+str(lowest_amt)+"-"+str(highest_amt)+"-" + classification


#regularNewsgroupsStuff()
예제 #30
0
def importCertificates(cert_fn, entity_name_fn):
    all_lines = dt.import1dArray(cert_fn)[14:]
    en = dt.import1dArray(entity_name_fn)
    original_en = dt.import1dArray(entity_name_fn)
    en_name = []
    en_year = []
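    # Split each "Title Year" entity string into a normalized name and its year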
    for e in range(len(en)):
        split = en[e].split()
        en_year.append(split[len(split)-1])
        name = "".join(split[:len(split)-1])
        en_name.append(dt.removeEverythingFromString(name))


    # Initialize ratings dict
    """
    ratings = {
        "USA:G": [],
        "USA:PG": [],
        "USA:PG-13": [],
        "USA:R": []
    }
    """
    ratings = {
        "UK:PG": [],
        "UK:12": [],
        "UK:12A": [],
        "UK:15": [],
        "UK:18": [],
    }

    all_ratings = defaultdict(list)
    recently_found_name = ""
    recently_found_year = ""
    recently_found_found = False
    counter = 0

    temp_fn = "../data/temp/uk_cert_dict.pickle"

    if dt.fileExists(temp_fn) is False:
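        # Parse each certificates line: recover the title and year, match it
        # against the entity list, and record its rating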
        for line in all_lines:
            line = line.split("\t")
            split_ny = line[0].split("{")[0]
            split_ny = split_ny.split()
            for i in range(len(split_ny)-1, -1, -1):
                if "{" in split_ny[i]:
                    del split_ny[i]
            entity_year_bracketed = split_ny[len(split_ny)-1]

            if "(V)" in entity_year_bracketed or "(TV)" in entity_year_bracketed or "(VG)" in entity_year_bracketed:
                entity_year_bracketed = split_ny[len(split_ny) - 2]
            try:
                entity_year = dt.keepNumbers(entity_year_bracketed)[0]
                entity_name = dt.removeEverythingFromString("".join(split_ny[:len(split_ny)-1]))
                found = False
                skip = False
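                # Consecutive lines often repeat the same title, so reuse the
                # previous match instead of re-scanning the entity list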
                if recently_found_name == entity_name and recently_found_year == entity_year:
                    skip = True
                    found = recently_found_found
                if not skip:
                    if not found:
                        for n in range(len(en_name)):
                            if entity_name == en_name[n] and entity_year == en_year[n]:
                                print("found", entity_name, entity_year)
                                found = True
                                break
                if found:
                    if("(" not in line[len(line)-1]):
                        entity_rating = line[len(line)-1]
                    else:
                        entity_rating = line[len(line)-2]
                    all_ratings[entity_rating].append(entity_name)
                    if entity_rating in ratings:
                        ratings[entity_rating].append(entity_name)
                        print("rating correct", entity_name, entity_year, entity_rating)
            except IndexError:
                # The year or name could not be parsed from this line; skip it
                print("IndexError", line, split_ny, entity_year_bracketed)
                continue
            recently_found_name = entity_name
            recently_found_year = entity_year
            recently_found_found = found
            counter += 1
            if counter % 1000 == 0:
                print(counter)
        # Store data (serialize)
        with open(temp_fn, 'wb') as handle:
            pickle.dump(ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open("../data/temp/uk_cert_dict_all.pickle", 'wb') as handle:
            pickle.dump(all_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Load data (deserialize)
    with open(temp_fn, 'rb') as handle:
        ratings = pickle.load(handle)
    if dt.fileExists("../data/temp/uk_cert_dict_all.pickle"):
        with open("../data/temp/uk_cert_dict_all.pickle", 'rb') as handle:
            all_ratings = pickle.load(handle)

    top_size = 0
    for key, value in all_ratings.items():
        top_size += len(value)
    print(top_size)
    top_size = 0

    new_ratings = defaultdict(list)
    real_name_dict_fn = "../data/temp/uk_real_name_dict.dict"
    if dt.fileExists(real_name_dict_fn) is False:
        # Match the names back to the original names
        for key, value in all_ratings.items():
            for r in ratings:
                if r == key:
                    top_size += len(value)
                    for v in range(len(value)):
                        found = False
                        for n in range(len(en_name)):
                            if value[v] == en_name[n]:
                                found = True
                                value[v] = original_en[n]
                                break
                        if found:
                            new_ratings[key].append(value[v])
                    break
        with open(real_name_dict_fn, 'wb') as handle:
            pickle.dump(new_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(real_name_dict_fn, 'rb') as handle:
            new_ratings = pickle.load(handle)
    # Set up the final grouped rating classes
    """
    final_dict = {
        "USA-G": [],
        "USA-PG-PG13": [],
        "USA-R": [],
    }
    """
    final_dict = {
        "UK-PG": [],
        "UK-12-12A": [],
        "UK-15": [],
        "UK-18": []
    }

    # Group the matched (normalized-name) ratings into the final classes

    final_dict["UK-PG"].extend(all_ratings["UK:PG"])
    final_dict["UK-12-12A"].extend(all_ratings["UK:12"])
    final_dict["UK-12-12A"].extend(all_ratings["UK:12A"])
    final_dict["UK-15"].extend(all_ratings["UK:15"])
    final_dict["UK-18"].extend(all_ratings["UK:18"])
    """
    final_dict["USA-G"].extend(all_ratings["USA:G"])
    final_dict["USA-PG-PG13"].extend(all_ratings["USA:PG"])
    final_dict["USA-PG-PG13"].extend(all_ratings["USA:PG13"])
    final_dict["USA-R"].extend(all_ratings["USA:R"])
    """
    """
    final_name_dict = {
        "USA-G": [],
        "USA-PG-PG13": [],
        "USA-R": [],

    }
    """
    final_name_dict = {
        "UK-PG": [],
        "UK-12-12A": [],
        "UK-15": [],
        "UK-18": [],
    }

    # Group the matched original entity names into the final classes

    final_name_dict["UK-PG"].extend(new_ratings["UK:PG"])
    final_name_dict["UK-12-12A"].extend(new_ratings["UK:12"])
    final_name_dict["UK-12-12A"].extend(new_ratings["UK:12A"])
    final_name_dict["UK-15"].extend(new_ratings["UK:15"])
    final_name_dict["UK-18"].extend(new_ratings["UK:18"])
    """
    final_name_dict["USA-G"].extend(new_ratings["USA:G"])
    final_name_dict["USA-PG-PG13"].extend(new_ratings["USA:PG"])
    final_name_dict["USA-PG-PG13"].extend(new_ratings["USA:PG13"])
    final_name_dict["USA-R"].extend(new_ratings["USA:R"])
    """

    # Create a unique list of the entities found
    entities_found = []
    for key, items in new_ratings.items():
        for i in items:
            entities_found.append(i)
    entities_found = np.unique(entities_found)
    print(len(entities_found))


    # Strip the year and re-normalize the recovered names so they match the en_name form used in final_dict
    jacked_up_entities_found = []
    for n in entities_found:
        new_n = n.split()[:-1]
        jacked_up_entities_found.append(dt.removeEverythingFromString(" ".join(new_n)))

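    # Build a one-hot class matrix: one row per rating class, one column per matched entity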
    classes = [[0] * len(entities_found) for _ in range(len(final_dict))]
    counter = 0
    class_names = []
    for key, items in final_dict.items():
        for i in items:
            for e in range(len(jacked_up_entities_found)):
                if i == jacked_up_entities_found[e]:
                    classes[counter][e] = 1
        class_names.append(key)
        counter += 1

    classes = np.asarray(classes).transpose()

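    # Remove entities that did not end up with any rating class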
    indexes_to_delete = []

    for c in range(len(classes)):
        found = False
        for i in classes[c]:
            if i == 1:
                found = True
                break
        if not found:
            indexes_to_delete.append(c)

    classes = np.delete(classes, indexes_to_delete, axis=0)
    entities_found = np.delete(entities_found, indexes_to_delete)

    classes = classes.transpose()

    for c in range(len(classes)):
        dt.write1dArray(classes[c], "../data/movies/classify/uk-ratings/class-" + class_names[c])

    classes = classes.transpose()

    dt.write2dArray(classes, "../data/movies/classify/uk-ratings/class-all")
    dt.write1dArray(entities_found, "../data/movies/classify/uk-ratings/available_entities.txt")
    dt.write1dArray(class_names, "../data/movies/classify/uk-ratings/names.txt")
    print("k")