def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = [key]
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum-similarity word-vector value for each cluster, across all clusters.
        # NOTE: this selection step is left unimplemented in this version of the source, so
        # cutoff_words is written out empty; see selectCutOffByExplanation for the completed variant.
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    pass
    dt.write2dArray(cutoff_words, "../data/movies/rules/cutoff/" + file_name + "WVN.txt")
def getAllPhraseRankings(directions_fn=None, vectors_fn=None, property_names_fn=None, vector_names_fn=None,
                         fn="no filename", percentage_increment=1, scores_fn=None, top_amt=0, discrete=False,
                         data_type="movies", rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"

    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)

    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names, scores)[:top_amt]

    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    # dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")
    dt.write2dArray(rankings, rankings_fn_all)
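# getRankings() is defined elsewhere in the repo. A minimal sketch of the assumed behaviour,
# for reference only: each entity vector is scored against each direction by a dot product,
# giving one ranking row per direction. The "Sketch" name is hypothetical, and the name
# arguments being unused is an assumption about the real helper.
import numpy as np

def getRankingsSketch(directions, vectors, property_names, vector_names):
    directions = np.asarray(directions, dtype="float64")
    vectors = np.asarray(vectors, dtype="float64")
    # rankings[d][e] = projection of entity e onto direction d
    return directions @ vectors.T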
def fixCutoffFormatting(cutoff_fn, file_name):
    cutoff = dt.import1dArray(cutoff_fn)
    for c in range(len(cutoff)):
        cutoff[c] = cutoff[c].split()
        for i in range(len(cutoff[c])):
            cutoff[c][i] = int(dt.stripPunctuation(cutoff[c][i]))
    dt.write2dArray(cutoff, "../data/movies/rules/cutoff/" + file_name + ".txt")
def saveClusters(directions_fn, scores_fn, names_fn, filename, amt_of_dirs, data_type, cluster_amt,
                 rewrite_files=False, algorithm="meanshift_k"):
    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    p_dir = dt.import2dArray(directions_fn)
    p_names = dt.import1dArray(names_fn, "s")
    p_scores = dt.import1dArray(scores_fn, "f")

    # Keep the top amt_of_dirs directions by score
    ids = np.argsort(p_scores)
    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]

    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)

    unique, counts = np.unique(labels, return_counts=True)
    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])

    # Represent each cluster by the mean of its member directions
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")

    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
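# The clustering helpers meanShift() and kMeans() called above are defined elsewhere in the
# repo. A minimal sketch of their assumed behaviour using scikit-learn; the exact parameters
# of the original helpers are not known here, so these are illustrative only.
from sklearn.cluster import KMeans, MeanShift

def kMeansSketch(directions, cluster_amt):
    # Partition the direction vectors into cluster_amt groups; returns one label per direction.
    return KMeans(n_clusters=cluster_amt, random_state=0).fit_predict(directions)

def meanShiftSketch(directions):
    # Mean shift picks the number of clusters itself; returns one label per direction.
    return MeanShift().fit_predict(directions)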
def main(data_type, clf, highest_amt, lowest_amt, depth, rewrite_files):
    min = lowest_amt
    max = highest_amt
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) + "-" + clf + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) + "-" + clf + "dmround"
    mds_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) + "-" + clf + "d" + str(depth)
    svd_fn = "../data/" + data_type + "/svd/class-all-" + str(min) + "-" + str(max) + "-" + clf + "d" + str(depth)
    pca_fn = "../data/" + data_type + "/pca/class-all-" + str(min) + "-" + str(max) + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) + "-" + clf + "round"
    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) + "-" + clf

    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()

    if dt.fileExists(dm_fn) is False:
        newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False)
        newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False)
        vectors = np.concatenate((newsgroups_train.data, newsgroups_test.data), axis=0)
        newsgroups_test = None
        newsgroups_train = None

        # Get sparse term-frequency representation
        tf_vectorizer = CountVectorizer(max_df=highest_amt, min_df=lowest_amt, stop_words='english')
        print("completed vectorizer")
        tf = tf_vectorizer.fit_transform(vectors)
        vectors = None

        # Get sparse PPMI representation from the sparse term-frequency representation
        sparse_ppmi = convertPPMISparse(tf)
        print("done ppmi sparse")

        # Get dissimilarity matrix from the sparse PPMI representation
        dm = getDissimilarityMatrixSparse(sparse_ppmi)
        dt.write2dArray(dm, dm_fn)
    else:
        dm = dt.import2dArray(dm_fn)

    print("starting mds")
    # Use the dissimilarity matrix as input to MDS
    mds = createMDS(dm, depth)
    # Save the MDS space
    dt.write2dArray(mds, mds_fn)
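# convertPPMISparse() is defined elsewhere in the repo. A minimal sketch of a sparse PPMI
# transform under the usual definition, assuming documents are rows and terms are columns.
# The name convert_ppmi_sparse_sketch is hypothetical and not the repo's implementation.
import numpy as np
from scipy.sparse import csr_matrix

def convert_ppmi_sparse_sketch(tf):
    # Positive PMI over the nonzero entries of a sparse term-frequency matrix:
    # ppmi[i][j] = max(0, log(count[i][j] * total / (row_sum[i] * col_sum[j])))
    tf = csr_matrix(tf, dtype=np.float64)
    total = tf.sum()
    row_sums = np.asarray(tf.sum(axis=1)).ravel()
    col_sums = np.asarray(tf.sum(axis=0)).ravel()
    coo = tf.tocoo()
    pmi = np.log((coo.data * total) / (row_sums[coo.row] * col_sums[coo.col]))
    pmi[pmi < 0] = 0.0
    return csr_matrix((pmi, (coo.row, coo.col)), shape=tf.shape)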
def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = [key]
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)

    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)

        # Get the maximum-similarity word-vector value for each cluster, across all clusters
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(clusters[cl])
            explained_cutoff.append(cluster_explanation + ",")
            dict_index = 0
            for h in range(len(cluster_dict_arrays[cl])):
                if cluster_dict_arrays[cl][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[cl][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)

    dt.write2dArray(explanations, "../data/movies/rules/final_names/" + file_name + "WVN.txt")
    dt.write2dArray(explanation_cutoffs, "../data/movies/rules/final_cutoff/" + file_name + ".txt")
def makePPMI(names_fn, scores_fn, amt, data_type, ppmi_fn, name_fn):
    scores = np.asarray(dt.import1dArray(scores_fn, "f"))
    names = np.asarray(dt.import1dArray(names_fn))
    names = names[np.flipud(np.argsort(scores))][:amt]
    if dt.allFnsAlreadyExist([ppmi_fn, name_fn]) is False:
        ppmi_file = []
        for name in names:
            ppmi_file.append(dt.import1dArray("../data/" + data_type + "/bow/ppmi/" + "class-" + name + "-100-10-all"))
        dt.write2dArray(ppmi_file, ppmi_fn)
        dt.write1dArray(names, name_fn)
    else:
        print("already_made PPMI of this size")
def getDissimilarityMatrixSparse(tf):
    tflen = tf.shape[0]
    dm = np.empty([tflen, tflen], dtype="float64")
    pithing = 2 / pi
    norms = np.empty(tflen, dtype="float64")

    # Calculate norms
    for ei in range(tflen):
        norms[ei] = spl.norm(tf[ei])
        print("norm", ei)

    dot_product = np.zeros([tflen, tflen], dtype="float64")
    use_old_dp = True
    if use_old_dp:
        dot_product = dt.import2dArray("dotproduct.temp")
    else:
        # Calculate dot products (the matrix is symmetric, so reuse the mirrored entry)
        for ei in range(tflen):
            for ej in range(tflen):
                if dot_product[ej][ei] != 0:
                    dot_product[ei][ej] = dot_product[ej][ei]
                    continue
                dot_product[ei][ej] = tf[ei].dot(tf[ej].T)[0, 0]
            print("dp", ei)
        dt.write2dArray(dot_product, "dotproduct.temp")

    # Calculate products of norms
    norm_multiplied = np.empty([tflen, tflen], dtype="float64")
    for ei in range(tflen):
        for ej in range(tflen):
            norm_multiplied[ei][ej] = norms[ei] * norms[ej]
        print("norms", ei)

    norm_multiplied = dt.shortenFloatsNoFn(norm_multiplied)
    dot_product = dt.shortenFloatsNoFn(dot_product)

    # Get angular differences, scaled to [0, 1] by 2/pi
    for ei in range(tflen):
        for ej in range(tflen):
            ang = pithing * np.arccos(dot_product[ei][ej] / norm_multiplied[ei][ej])
            dm[ei][ej] = ang
        print(ei)
    return dm
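# A vectorized sketch of the same normalised-angular-difference matrix, assuming tf is a
# scipy.sparse matrix with one entity per row. This is not the repo's implementation, just
# an equivalent formulation: dm[i][j] = (2/pi) * arccos(cos_sim(tf[i], tf[j])).
import numpy as np
from math import pi
from scipy.sparse.linalg import norm as sparse_norm

def angular_dissimilarity_sketch(tf):
    dots = np.asarray((tf @ tf.T).todense(), dtype="float64")   # pairwise dot products
    norms = sparse_norm(tf, axis=1)                             # row norms
    cos = dots / np.outer(norms, norms)                         # cosine similarities
    cos = np.clip(cos, -1.0, 1.0)                               # guard arccos against rounding error
    return (2 / pi) * np.arccos(cos)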
def getCutOff(cluster_dict_fn, rankings_fn, file_name):
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)
    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])

    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            max_score = 0
            cutoff = 0
            # Try every percentile cutoff and keep the one with the best kappa against the phrase labels
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                y_test = dt.import2dArray("../data/movies/bow/frequency/phrases/class-" + v, "s")
                score = cohen_kappa_score(y_test, y_pred)
                print(v, int(i), "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter += 1
    dt.write2dArray(cutoff_clusters, "../data/movies/rules/cutoff/" + file_name + ".txt")
def getAllRankings(directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, percent, percentage_increment,
                   by_vector, fn, discrete=True, data_type="movies", rewrite_files=False):
    labels_fn = "../data/" + data_type + "/rank/labels/" + fn + ".txt"
    rankings_fn = "../data/" + data_type + "/rank/numeric/" + fn + ".txt"
    discrete_labels_fn = "../data/" + data_type + "/rank/discrete/" + fn + ".txt"

    all_fns = [rankings_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        for f in all_fns:
            print(f, "Already exists")
        print("Skipping task", "getAllRankings")
        return
    else:
        print("Running task", "getAllRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    cluster_names = dt.import1dArray(cluster_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)

    rankings = getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.asarray(rankings)
    if discrete:
        labels = createLabels(rankings, percent)
        labels = np.asarray(labels)
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        if discrete:
            labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()
    if discrete:
        dt.write2dArray(labels, labels_fn)
    dt.write2dArray(rankings, rankings_fn)
    if discrete:
        dt.write2dArray(discrete_labels, discrete_labels_fn)
def __init__(self, vector_path, class_path, property_names_fn, file_name, svm_type, training_size=10000,
             lowest_count=200, highest_count=21470000, get_kappa=True, get_f1=True, single_class=True,
             data_type="movies", getting_directions=True, threads=1, chunk_amt=0, chunk_id=0,
             rewrite_files=False, classification="all", loc="../data/", logistic_regression=False,
             sparse_array_fn=None, only_these_fn=None):
    self.get_kappa = True
    self.get_f1 = get_f1
    self.data_type = data_type
    self.classification = classification
    self.lowest_amt = lowest_count
    self.higher_amt = highest_count

    if chunk_amt > 0:
        file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(chunk_amt)

    directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
    ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
    kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
    acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"
    TP_fn = loc + data_type + "/svm/stats/TP " + file_name + ".txt"
    FP_fn = loc + data_type + "/svm/stats/FP " + file_name + ".txt"
    TN_fn = loc + data_type + "/svm/stats/TN " + file_name + ".txt"
    FN_fn = loc + data_type + "/svm/stats/FN " + file_name + ".txt"

    all_fns = [directions_fn, kappa_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getSVMResults")
        return
    else:
        print("Running task", "getSVMResults")

    y_train = 0
    y_test = 0
    vectors = np.asarray(dt.import2dArray(vector_path))
    print("imported vectors")
    if not getting_directions:
        classes = np.asarray(dt.import2dArray(class_path))
        print("imported classes")
    property_names = dt.import1dArray(property_names_fn)
    print("imported property names")

    if chunk_amt > 0:
        if chunk_id == chunk_amt - 1:
            chunk = int(len(property_names) / chunk_amt)
            multiply = chunk_amt - 1
            property_names = property_names[chunk * multiply:]
        else:
            property_names = dt.chunks(property_names, int((len(property_names) / chunk_amt)))[chunk_id]

    if sparse_array_fn is not None:
        sparse_array = dt.import2dArray(sparse_array_fn)
    else:
        sparse_array = None

    if sparse_array is not None:
        for s in range(len(sparse_array)):
            if len(np.nonzero(sparse_array[s])[0]) <= 1:
                print("WILL FAIL", s, len(np.nonzero(sparse_array[s])[0]))
            else:
                print(len(np.nonzero(sparse_array[s])[0]))

    if not getting_directions:
        x_train, x_test, y_train, y_test = train_test_split(vectors, classes, test_size=0.3, random_state=0)
    else:
        x_train = vectors
        x_test = vectors

    if get_f1:
        y_train = y_train.transpose()
        y_test = y_test.transpose()
        print("transposed")

    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test

    if only_these_fn is not None:
        only_these = dt.import1dArray(only_these_fn, "s")
        inds = []
        for s in range(len(property_names)):
            for o in only_these:
                if property_names[s] == o:
                    inds.append(s)
                    break
        sparse_array = sparse_array[inds]
        property_names = property_names[inds]

    if self.get_f1 is False:
        print("running svms")
        kappa_scores, directions, f1_scores, property_names, accs, TPs, FPs, TNs, FNs = self.runAllSVMs(
            y_test, y_train, property_names, file_name, svm_type, getting_directions, threads,
            logistic_regression, sparse_array)

        dt.write1dArray(kappa_scores, kappa_fn)
        dt.write2dArray(directions, directions_fn)
        dt.write1dArray(f1_scores, ktau_scores_fn)
        dt.write1dArray(accs, acc_fn)
        dt.write1dArray(TPs, TP_fn)
        dt.write1dArray(FPs, FP_fn)
        dt.write1dArray(TNs, TN_fn)
        dt.write1dArray(FNs, FN_fn)
        dt.write1dArray(property_names, property_names_fn + file_name + ".txt")
    else:
        final_f1 = []
        final_acc = []
        for y in range(len(y_train)):
            f1, acc = self.runClassifySVM(y_test[y], y_train[y])
            print(f1, acc)
            final_f1.append(f1)
            final_acc.append(acc)
        dt.write1dArray(final_f1, ktau_scores_fn)
        dt.write1dArray(final_acc, acc_fn)
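# runAllSVMs() is implemented elsewhere in the repo. A minimal sketch of the per-property step
# it is assumed to perform when getting directions: fit a linear classifier on the entity
# vectors against one property's binary labels, keep the weight vector as that property's
# direction, and score the fit with Cohen's kappa. The helper name is hypothetical.
from sklearn.svm import LinearSVC
from sklearn.metrics import cohen_kappa_score

def fit_direction_sketch(x_train, y_train, x_test, y_test):
    clf = LinearSVC(class_weight="balanced")
    clf.fit(x_train, y_train)
    direction = clf.coef_[0]                                   # normal of the separating hyperplane
    kappa = cohen_kappa_score(y_test, clf.predict(x_test))     # agreement on held-out labels
    return direction, kappa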
def getClusters(directions_fn, scores_fn, names_fn, is_gini, amt_high_directions, amt_low_directions, filename,
                amt_of_clusters, high_threshold, low_threshold, data_type, rewrite_files=False,
                half_kappa_half_ndcg="", dont_cluster=0):
    cluster_names_fn = "../data/" + data_type + "/cluster/first_terms/" + filename + ".txt"
    clusters_fn = "../data/" + data_type + "/cluster/first_term_clusters/" + filename + ".txt"
    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [cluster_names_fn, clusters_fn, dict_fn, cluster_directions_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", getClusters.__name__)
        return
    else:
        print("Running task", getClusters.__name__)

    hdn, ldn, hd, ld = splitDirections(directions_fn, scores_fn, names_fn, is_gini, amt_high_directions,
                                       amt_low_directions, high_threshold, low_threshold, half_kappa_half_ndcg)
    if amt_low_directions != amt_of_clusters:
        cluster_directions, least_similar_cluster_names, cluster_name_dict, least_similar_clusters = \
            createTermClusters(hd, ld, hdn, ldn, amt_of_clusters, dont_cluster)
    else:
        least_similar_clusters = hd
        cluster_directions = hd
        least_similar_cluster_names = hdn
        cluster_name_dict = OrderedDict()
        for n in hdn:
            cluster_name_dict[n] = ""

    # word_vector_names = nameClustersMedoid(cluster_name_dict)
    additional_text = ""
    # if is_gini:
    #     additional_text = "gini"
    """
    directions = np.asarray(dt.import2dArray(directions_fn))
    names = np.asarray(dt.import1dArray(names_fn))
    least_similar_cluster_names.extend(hdn)
    least_similar_cluster_names.extend(ldn)
    least_similar_clusters.extend(hd)
    least_similar_clusters.extend(ld)
    cluster_center_directions.extend(ld)
    cluster_center_directions.extend(directions)
    """
    dt.write1dArray(least_similar_cluster_names, cluster_names_fn)
    dt.write2dArray(least_similar_clusters, clusters_fn)
    dt.writeArrayDict(cluster_name_dict, dict_fn)
    # dt.write1dArray(word_vector_names, word_vector_names_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
def __init__(self, class_path=None, get_scores=False, randomize_finetune_weights=False, dropout_noise=None,
             amount_of_hidden=0, epochs=1, learn_rate=0.01, loss="mse", batch_size=1, past_model_bias_fn=None,
             identity_swap=False, reg=0.0, amount_of_finetune=[], output_size=25, hidden_activation="tanh",
             layer_init="glorot_uniform", output_activation="tanh", deep_size=None, corrupt_finetune_weights=False,
             split_to_use=-1, hidden_layer_size=100, file_name="unspecified_filename", vector_path=None,
             is_identity=False, finetune_size=0, data_type="movies", optimizer_name="rmsprop", noise=0.0,
             fine_tune_weights_fn=None, past_model_weights_fn=None, from_ae=True, save_outputs=False,
             label_names_fn="", rewrite_files=False, cv_splits=1, cutoff_start=0.2, development=False,
             class_weight=None, csv_fn=None, tune_vals=False, get_nnet_vectors_path=None,
             classification_name="all", limit_entities=False, limited_label_fn="", vector_names_fn="",
             identity_activation="linear", loc="../data/", lock_weights_and_redo=False):

    weights_fn = loc + data_type + "/nnet/weights/" + file_name + "L0.txt"
    bias_fn = loc + data_type + "/nnet/bias/" + file_name + "L0.txt"
    rank_fn = loc + data_type + "/nnet/clusters/" + file_name + ".txt"

    all_fns = [weights_fn, bias_fn, rank_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "nnet")
        return
    else:
        print("Running task", "nnet")

    self.class_path = class_path
    self.learn_rate = learn_rate
    self.epochs = epochs
    self.loss = loss
    self.batch_size = batch_size
    self.hidden_activation = hidden_activation
    self.layer_init = layer_init
    self.output_activation = output_activation
    self.hidden_layer_size = hidden_layer_size
    self.file_name = file_name
    self.vector_path = vector_path
    self.dropout_noise = dropout_noise
    self.finetune_size = finetune_size
    self.get_scores = get_scores
    self.reg = reg
    self.amount_of_finetune = amount_of_finetune
    self.amount_of_hidden = amount_of_hidden
    self.output_size = output_size
    self.identity_swap = identity_swap
    self.deep_size = deep_size
    self.from_ae = from_ae
    self.is_identity = is_identity
    self.randomize_finetune_weights = randomize_finetune_weights
    self.corrupt_finetune_weights = corrupt_finetune_weights
    self.fine_tune_weights_fn = fine_tune_weights_fn
    self.identity_activation = identity_activation
    self.lock_weights_and_redo = lock_weights_and_redo

    print(data_type)

    if optimizer_name == "adagrad":
        self.optimizer = Adagrad()
    elif optimizer_name == "sgd":
        self.optimizer = SGD()
    elif optimizer_name == "rmsprop":
        self.optimizer = RMSprop()
    elif optimizer_name == "adam":
        self.optimizer = Adam()
    elif optimizer_name == "adadelta":
        self.optimizer = Adadelta()
    else:
        print("optimizer not found")
        exit()

    entity_vectors = np.asarray(dt.import2dArray(self.vector_path))
    print("Imported vectors", len(entity_vectors), len(entity_vectors[0]))
    if get_nnet_vectors_path is not None:
        nnet_vectors = np.asarray(dt.import2dArray(get_nnet_vectors_path))
        print("Imported nnet vectors", len(nnet_vectors), len(nnet_vectors[0]))
    entity_classes = np.asarray(dt.import2dArray(self.class_path))
    print("Imported classes", len(entity_classes), len(entity_classes[0]))

    if fine_tune_weights_fn is None:
        vector_names = dt.import1dArray(vector_names_fn)
        limited_labels = dt.import1dArray(limited_label_fn)
        entity_vectors = np.asarray(dt.match_entities(entity_vectors, limited_labels, vector_names))

    if fine_tune_weights_fn is not None:
        if len(entity_vectors) != len(entity_classes):
            entity_classes = entity_classes.transpose()
            print("Transposed classes, now in form", len(entity_classes), len(entity_classes[0]))
        """
        # If BoW
        if len(entity_vectors[0]) != len(entity_classes[0]):
            entity_vectors = entity_vectors.transpose()
            print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0]))
        """
    elif len(entity_vectors) != len(entity_classes):
        entity_vectors = entity_vectors.transpose()
        print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0]))

    self.input_size = len(entity_vectors[0])
    self.output_size = len(entity_classes[0])

    if fine_tune_weights_fn is not None:
        model_builder = self.fineTuneNetwork
        weights = []
        if from_ae:
            self.past_weights = []
            past_model_weights = []
            for p in past_model_weights_fn:
                past_model_weights.append(np.asarray(dt.import2dArray(p), dtype="float64"))
            past_model_bias = []
            for p in past_model_bias_fn:
                past_model_bias.append(np.asarray(dt.import1dArray(p, "f"), dtype="float64"))
            for p in range(len(past_model_weights)):
                past_model_weights[p] = np.around(past_model_weights[p], decimals=6)
                past_model_bias[p] = np.around(past_model_bias[p], decimals=6)
            for p in range(len(past_model_weights)):
                self.past_weights.append([])
                self.past_weights[p].append(past_model_weights[p])
                self.past_weights[p].append(past_model_bias[p])
        for f in fine_tune_weights_fn:
            weights.extend(dt.import2dArray(f))
        r = np.asarray(weights, dtype="float64")
        for a in range(len(r)):
            r[a] = np.around(r[a], decimals=6)
        for a in range(len(entity_classes)):
            entity_classes[a] = np.around(entity_classes[a], decimals=6)
        self.fine_tune_weights = []
        self.fine_tune_weights.append(r.transpose())
        self.fine_tune_weights.append(np.zeros(shape=len(r), dtype="float64"))
    else:
        model_builder = self.classifierNetwork

    # Converting labels to categorical
    f1_scores = []
    accuracy_scores = []
    f1_averages = []
    accuracy_averages = []
    original_fn = file_name

    x_train, y_train, x_test, y_test, x_dev, y_dev = split_data.splitData(entity_vectors, entity_classes, data_type)
    if development:
        x_test = x_dev
        y_test = y_dev

    model = model_builder()

    if get_scores:
        test_pred = model.predict(x_train).transpose()
        print(test_pred)
        highest_vals = [0.5] * len(test_pred)  # Default threshold of 0.5
        y_pred = model.predict(x_test).transpose()
        y_test = np.asarray(y_test).transpose()
        for y in range(len(y_pred)):
            y_pred[y][y_pred[y] >= highest_vals[y]] = 1
            y_pred[y][y_pred[y] < highest_vals[y]] = 0

        f1_array = []
        accuracy_array = []
        for y in range(len(y_pred)):
            accuracy_array.append(accuracy_score(y_test[y], y_pred[y]))
            f1_array.append(f1_score(y_test[y], y_pred[y], average="binary"))
            print(f1_array[y])

        y_pred = y_pred.transpose()
        y_test = np.asarray(y_test).transpose()
        micro_average = f1_score(y_test, y_pred, average="micro")

        cv_f1_fn = loc + data_type + "/nnet/scores/F1 " + file_name + ".txt"
        cv_acc_fn = loc + data_type + "/nnet/scores/ACC " + file_name + ".txt"
        dt.write1dArray(f1_array, cv_f1_fn)
        dt.write1dArray(accuracy_array, cv_acc_fn)

        f1_scores.append(f1_array)
        accuracy_scores.append(accuracy_array)
        f1_average = np.average(f1_array)
        accuracy_average = np.average(accuracy_array)
        f1_averages.append(f1_average)
        accuracy_averages.append(accuracy_average)
        print("Average F1 Binary", f1_average, "Acc", accuracy_average)
        print("Micro Average F1", micro_average)

        f1_array.append(f1_average)
        f1_array.append(micro_average)
        accuracy_array.append(accuracy_average)
        accuracy_array.append(0.0)

        scores = [accuracy_array, f1_array]
        csv_fn = loc + data_type + "/nnet/csv/" + csv_fn + ".csv"
        file_names = [file_name + "ACC", file_name + "F1"]
        label_names = dt.import1dArray(label_names_fn)

        if dt.fileExists(csv_fn):
            print("File exists, writing to csv")
            try:
                dt.write_to_csv(csv_fn, file_names, scores)
            except PermissionError:
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                dt.write_to_csv(csv_fn[:len(csv_fn) - 4] + str(random.random()) + "FAIL.csv", [file_name], scores)
        else:
            print("File does not exist, recreating csv")
            key = []
            for l in label_names:
                key.append(l)
            key.append("AVERAGE")
            key.append("MICRO AVERAGE")
            dt.write_csv(csv_fn, file_names, scores, key)

    if save_outputs:
        if limit_entities is False:
            self.output_clusters = model.predict(nnet_vectors)
        else:
            self.output_clusters = model.predict(entity_vectors)
        self.output_clusters = self.output_clusters.transpose()
        dt.write2dArray(self.output_clusters, rank_fn)

    # Save the hidden-layer spaces of progressively truncated copies of the model
    for l in range(0, len(model.layers) - 1):
        if dropout_noise is not None and dropout_noise > 0.0:
            if l % 2 == 1:
                continue
        print("Writing", l, "layer")
        truncated_model = Sequential()
        for a in range(l + 1):
            truncated_model.add(model.layers[a])
        truncated_model.compile(loss=self.loss, optimizer="sgd")
        if get_nnet_vectors_path is not None:
            self.end_space = truncated_model.predict(nnet_vectors)
        else:
            self.end_space = truncated_model.predict(entity_vectors)
        total_file_name = loc + data_type + "/nnet/spaces/" + file_name
        dt.write2dArray(self.end_space, total_file_name + "L" + str(l) + ".txt")

    # Save weights and biases for every layer that has them
    for l in range(len(model.layers)):
        try:
            dt.write2dArray(model.layers[l].get_weights()[0],
                            loc + data_type + "/nnet/weights/" + file_name + "L" + str(l) + ".txt")
            dt.write1dArray(model.layers[l].get_weights()[1],
                            loc + data_type + "/nnet/bias/" + file_name + "L" + str(l) + ".txt")
        except IndexError:
            print("Layer ", str(l), "Failed")
def __init__(self, vector_path, class_path, property_names_fn, file_name, svm_type, training_size=10000,
             lowest_count=200, highest_count=21470000, get_kappa=True, get_f1=True, single_class=True,
             data_type="movies", getting_directions=True, threads=1, chunk_amt=0, chunk_id=0,
             rewrite_files=False, classification="all", loc="../data/"):
    self.get_kappa = True
    self.get_f1 = get_f1
    self.data_type = data_type
    self.classification = classification
    self.lowest_amt = lowest_count
    self.higher_amt = highest_count

    if chunk_amt > 0:
        file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(chunk_amt)

    directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
    ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
    kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
    acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"

    all_fns = [directions_fn, kappa_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getSVMResults")
        return
    else:
        print("Running task", "getSVMResults")

    y_train = 0
    y_test = 0
    vectors = np.asarray(dt.import2dArray(vector_path))
    print("imported vectors")
    if not getting_directions:
        classes = np.asarray(dt.import2dArray(class_path))
        print("imported classes")
    property_names = dt.import1dArray(property_names_fn)
    print("imported property names")

    if chunk_amt > 0:
        if chunk_id == chunk_amt - 1:
            chunk = int(len(property_names) / chunk_amt)
            multiply = chunk_amt - 1
            property_names = property_names[chunk * multiply:]
        else:
            property_names = dt.chunks(property_names, int((len(property_names) / chunk_amt)))[chunk_id]

    if not getting_directions:
        x_train, x_test, y_train, y_test = train_test_split(vectors, classes, test_size=0.3, random_state=0)
    else:
        x_train = vectors
        x_test = vectors

    if get_f1:
        y_train = y_train.transpose()
        y_test = y_test.transpose()
        print("transposed")

    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test

    if self.get_f1 is False:
        print("running svms")
        kappa_scores, directions, ktau_scores, property_names = self.runAllSVMs(
            y_test, y_train, property_names, file_name, svm_type, getting_directions, threads)
        dt.write1dArray(kappa_scores, kappa_fn)
        dt.write2dArray(directions, directions_fn)
        dt.write1dArray(ktau_scores, ktau_scores_fn)
        dt.write1dArray(property_names, property_names_fn + file_name + ".txt")
    else:
        final_f1 = []
        final_acc = []
        for y in range(len(y_train)):
            f1, acc = self.runClassifySVM(y_test[y], y_train[y])
            print(f1, acc)
            final_f1.append(f1)
            final_acc.append(acc)
        dt.write1dArray(final_f1, ktau_scores_fn)
        dt.write1dArray(final_acc, acc_fn)
def __init__(self, features_fn, classes_fn, class_names_fn, cluster_names_fn, filename, training_data,
             max_depth=None, balance=None, criterion="entropy", save_details=False, data_type="movies",
             cv_splits=5, csv_fn="../data/temp/no_csv_provided.csv", rewrite_files=False, split_to_use=-1,
             development=False, limit_entities=False, limited_label_fn=None, vector_names_fn=None,
             clusters_fn="", cluster_duplicates=False, save_results_so_far=False, multi_label=False):

    label_names = dt.import1dArray(class_names_fn)
    filename = filename + str(max_depth)

    file_names = ['ACC ' + filename, 'F1 ' + filename]
    acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[0] + '.scores'
    prediction_fn = '../data/' + data_type + '/rules/tree_output/' + filename + '.scores'
    f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[1] + '.scores'
    all_top_names_fn = "../data/" + data_type + "/rules/names/" + filename + ".txt"
    all_top_rankings_fn = "../data/" + data_type + "/rules/rankings/" + filename + ".txt"
    all_top_clusters_fn = "../data/" + data_type + "/rules/clusters/" + filename + ".txt"
    fns_name = "../data/" + data_type + "/rules/names/" + filename + label_names[0] + ".txt"
    features_name = "../data/" + data_type + "/rules/rankings/" + filename + label_names[0] + ".txt"
    dt_clusters_name = "../data/" + data_type + "/rules/clusters/" + filename + label_names[0] + ".txt"

    if save_details is False:
        all_fns = [acc_fn, f1_fn, prediction_fn, csv_fn]
    else:
        new_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[0] + " " + filename + '.png'
        all_fns = [acc_fn, f1_fn, prediction_fn, csv_fn]
    if max_depth is not None:
        all_fns.append(all_top_names_fn)
        all_fns.append(all_top_rankings_fn)
        all_fns.append(all_top_clusters_fn)
    if save_details:
        orig_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[0] + " " + filename + 'orig.txt'
        # all_fns.append(orig_dot_file_fn)
        model_name_fn = "../data/" + data_type + "/rules/tree_model/" + label_names[0] + " " + filename + ".model"
        # all_fns.append(model_name_fn)

    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "DecisionTree")
        return
    else:
        print("Running task", "DecisionTree")

    vectors = np.asarray(dt.import2dArray(features_fn))
    if data_type == "sentiment":
        # If it's just a binary class...
        labels = np.asarray(dt.import1dArray(classes_fn, "i"))
    else:
        labels = np.asarray(dt.import2dArray(classes_fn, "i"))

    print("vectors", len(vectors), len(vectors[0]))
    print("labels", len(labels), len(labels[0]))
    if data_type == "sentiment" or len(vectors) != len(labels[0]):
        vectors = vectors.transpose()
        print("vectors", len(vectors), len(vectors[0]))

    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    clusters = dt.import2dArray(clusters_fn, "f")
    original_vectors = vectors

    if "ratings" in classes_fn:
        orig_path = "/".join(classes_fn.split("/")[:-1]) + "/"
        match_ids_fn = orig_path + "matched_ids.txt"
        if os.path.exists(match_ids_fn):
            matched_ids = dt.import1dArray(match_ids_fn, "i")
        else:
            vector_names = dt.import1dArray(vector_names_fn)
            limited_labels = dt.import1dArray(limited_label_fn)
            matched_ids = dt.match_entities(vector_names, limited_labels)
            dt.write1dArray(matched_ids, match_ids_fn)
        vectors = vectors[matched_ids]
        print("vectors", len(vectors))

    print("Past limit entities")

    for l in range(len(label_names)):
        if label_names[l][:6] == "class-":
            label_names[l] = label_names[l][6:]

    f1_array = []
    accuracy_array = []
    prec_array = []
    recall_array = []

    if not multi_label and data_type != "sentiment":
        labels = labels.transpose()
        print("labels transposed")
        print("labels", len(labels), len(labels[0]))
    else:
        labels = [labels]

    all_top_clusters = []
    all_top_rankings = []
    all_top_names = []
    all_top_inds = []
    all_y_test = []
    all_predictions = []

    print("At label prediction")
    for l in range(len(labels)):
        # Select training data with cross validation
        ac_y_test = []
        cv_acc = []
        cv_prec = []
        cv_recall = []
        c = 0
        # If doing cross-validation
        if cv_splits > 1:
            ac_x_train, ac_y_train, ac_x_test, ac_y_test, ac_x_dev, ac_y_dev = split_data.crossValData(
                cv_splits, vectors, labels[l])
        else:
            x_train, y_train, x_test, y_test, x_dev, y_dev = split_data.splitData(vectors, labels[l], data_type)
            ac_x_train = [x_train]
            ac_y_train = [y_train]
            ac_x_test = [x_test]
            ac_y_test = [y_test]
            ac_x_dev = [x_dev]
            ac_y_dev = [y_dev]
        if development:
            ac_x_test = ac_x_dev
            ac_y_test = ac_y_dev

        predictions = []
        for splits in range(len(ac_y_test)):
            model_name_fn = "../data/" + data_type + "/rules/tree_model/" + label_names[l] + " " + filename + ".model"
            """
            if dt.fileExists(model_name_fn) and not rewrite_files:
                try:
                    clf = joblib.load(model_name_fn)
                except KeyError:
                    print(model_name_fn)  # If a model is disrupted partway through its processing
            else:
            """
            clf = tree.DecisionTreeClassifier(max_depth=max_depth, criterion=criterion, class_weight=balance)
            clf.fit(ac_x_train[splits], ac_y_train[splits])
            joblib.dump(clf, model_name_fn)
            predictions.append(clf.predict(ac_x_test[splits]))

        ac_y_test = list(ac_y_test)
        predictions = list(predictions)
        # The per-split scoring block is not present in this version of the source; the lines
        # below are a minimal reconstruction consistent with how cv_acc/cv_prec/cv_recall,
        # all_y_test and all_predictions are used further down (accuracy_score, precision_score
        # and recall_score come from sklearn.metrics).
        for i in range(len(predictions)):
            cv_acc.append(accuracy_score(ac_y_test[i], predictions[i]))
            cv_prec.append(precision_score(ac_y_test[i], predictions[i]))
            cv_recall.append(recall_score(ac_y_test[i], predictions[i]))
            all_y_test.extend(ac_y_test[i])
            all_predictions.extend(predictions[i])
        accuracy_array.append(np.average(cv_acc))
        prec_array.append(np.average(cv_prec))
        recall_array.append(np.average(cv_recall))

        class_names = ["NOT " + label_names[l], label_names[l]]

        # Export a tree for each label predicted by the clf
        if save_details:
            orig_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[l] + " " + filename + 'orig.txt'
            new_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[l] + " " + filename + '.txt'
            orig_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[l] + " " + filename + 'orig.png'
            new_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[l] + " " + filename + '.png'
            orig_temp_graph_png_fn = '../data/' + data_type + '/rules/tree_temp/' + label_names[l] + " " + filename + 'orig.png'
            new_temp_graph_png_fn = '../data/' + data_type + '/rules/tree_temp/' + label_names[l] + " " + filename + '.png'

            # Use the first eight terms of each cluster as the feature name
            output_names = []
            for c in cluster_names:
                line = ""
                counter = 0
                for i in range(len(c)):
                    line = line + c[i] + " "
                    counter += 1
                    if counter == 8:
                        break
                output_names.append(line)

            failed = False
            try:
                tree.export_graphviz(clf, feature_names=output_names, class_names=class_names,
                                     out_file=orig_dot_file_fn, max_depth=max_depth, label='all', filled=True,
                                     impurity=True, node_ids=True, proportion=True, rounded=True)
            except FileNotFoundError:
                try:
                    orig_dot_file_fn = "//?/" + orig_dot_file_fn
                    tree.export_graphviz(clf, feature_names=output_names, class_names=class_names,
                                         out_file=orig_dot_file_fn, max_depth=max_depth, label='all', filled=True,
                                         impurity=True, node_ids=True, proportion=True, rounded=True)
                except FileNotFoundError:
                    failed = True
                    print("could not export the graphviz file")

            if failed == False:
                # Rewrite the numeric thresholds in the dot file as qualitative labels
                rewrite_dot_file = dt.import1dArray(orig_dot_file_fn)
                new_dot_file = []
                max = 3
                min = -3
                """
                for f in original_vectors:
                    for n in f:
                        if n > max:
                            max = n
                        if n < min:
                            min = n
                """
                print(max)
                print(min)
                boundary = max - min
                boundary = boundary / 5
                bound_1 = 0 - boundary * 2
                bound_2 = 0 - boundary * 1
                bound_3 = 0
                bound_4 = 0 + boundary
                bound_5 = 0 + boundary * 2
                for s in rewrite_dot_file:
                    if ":" in s:
                        s = s.split("<=")
                        no_num = s[0]
                        num = s[1]
                        num = num.split()
                        end = " ".join(num[:-1])
                        num_split = num[0].split("\\")
                        num = num_split[0]
                        end = end[len(num):]
                        num = float(num)
                        replacement = ""
                        if num <= bound_2:
                            replacement = "VERY LOW"
                        elif num <= bound_3:
                            replacement = "VERY LOW - LOW"
                        elif num <= bound_4:
                            replacement = "VERY LOW - AVERAGE"
                        elif num <= bound_5:
                            replacement = "VERY LOW - HIGH"
                        elif num >= bound_5:
                            replacement = "VERY HIGH"
                        new_string_a = [no_num, replacement, end]
                        new_string = " ".join(new_string_a)
                        new_dot_file.append(new_string)
                        if "]" in new_string:
                            if '"' not in new_string[len(new_string) - 10:]:
                                for c in range(len(new_string)):
                                    if new_string[c + 1] == "]":
                                        new_string = new_string[:c] + '"' + new_string[c:]
                                        break
                    else:
                        new_dot_file.append(s)
                    """
                    new_string = s
                    if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s:
                        index = s.index("value")
                        new_string = s[:index] + '"] ;'
                    new_dot_file.append(new_string)
                    """
                    # new_dot_file.append(s)
                dt.write1dArray(new_dot_file, new_dot_file_fn)
                try:
                    orig_graph = pydot.graph_from_dot_file(orig_dot_file_fn)
                    new_graph = pydot.graph_from_dot_file(new_dot_file_fn)
                    orig_graph.write_png(orig_graph_png_fn)
                    new_graph.write_png(new_graph_png_fn)
                    orig_graph.write_png(orig_temp_graph_png_fn)
                    new_graph.write_png(new_temp_graph_png_fn)
                except FileNotFoundError:
                    orig_graph_png_fn = "//?/" + orig_graph_png_fn
                    try:
                        orig_graph.write_png(orig_graph_png_fn)
                        new_graph_png_fn = "//?/" + new_graph_png_fn
                        new_graph.write_png(new_graph_png_fn)
                    except FileNotFoundError:
                        print("failed graph")

            self.get_code(clf, output_names, class_names, label_names[l] + " " + filename, data_type)

        dt_clusters, features, fns, inds = self.getNodesToDepth(clf, original_vectors, cluster_names, clusters)
        print(filename + label_names[l])
        fns_name = "../data/" + data_type + "/rules/names/" + filename + label_names[l] + ".txt"
        features_name = "../data/" + data_type + "/rules/rankings/" + filename + label_names[l] + ".txt"
        dt_clusters_name = "../data/" + data_type + "/rules/clusters/" + filename + label_names[l] + ".txt"
        dt.write2dArray(fns, fns_name)
        dt.write2dArray(features, features_name)
        dt.write2dArray(dt_clusters, dt_clusters_name)
        all_top_rankings.extend(features)
        all_top_clusters.extend(dt_clusters)
        all_top_names.extend(fns)
        all_top_inds.extend(inds)

    print("len clusters", len(all_top_clusters))
    print("len rankings", len(all_top_rankings))
    print("len names", len(all_top_names))
    if len(all_top_clusters) != len(all_top_rankings) or len(all_top_clusters) != len(all_top_names):
        print("stop")

    accuracy_array = np.asarray(accuracy_array)
    accuracy_average = np.average(accuracy_array)
    prec_array = np.asarray(prec_array)
    average_prec = np.average(prec_array)
    recall_array = np.asarray(recall_array)
    average_recall = np.average(recall_array)
    f1_average = 2 * ((average_prec * average_recall) / (average_prec + average_recall))
    if math.isnan(f1_average):
        print("NAN", average_prec, average_recall)
        f1_average = 0.0

    all_y_test = np.asarray(all_y_test)
    all_predictions = np.asarray(all_predictions)
    micro_average = f1_score(all_y_test, all_predictions, average="micro")

    accuracy_array = accuracy_array.tolist()
    accuracy_array.append(accuracy_average)
    accuracy_array.append(0.0)
    f1_array.append(f1_average)
    f1_array.append(micro_average)

    scores = [accuracy_array, f1_array]
    dt.write1dArray(accuracy_array, acc_fn)
    dt.write1dArray(f1_array, f1_fn)
    dt.write2dArray(all_predictions, prediction_fn)

    if dt.fileExists(csv_fn):
        print("File exists, writing to csv")
        try:
            dt.write_to_csv(csv_fn, file_names, scores)
        except PermissionError:
            print("CSV FILE WAS OPEN, SKIPPING")
        except ValueError:
            print("File does not exist, recreating csv")
            key = []
            for l in label_names:
                key.append(l)
            key.append("AVERAGE")
            key.append("MICRO AVERAGE")
            dt.write_csv(csv_fn, file_names, scores, key)
    else:
        print("File does not exist, recreating csv")
        key = []
        for l in label_names:
            key.append(l)
        key.append("AVERAGE")
        key.append("MICRO AVERAGE")
        dt.write_csv(csv_fn, file_names, scores, key)

    if max_depth is not None:
        all_top_names = np.asarray(all_top_names)
        all_top_rankings = np.asarray(all_top_rankings)
        all_top_clusters = np.asarray(all_top_clusters)
        all_top_inds = np.asarray(all_top_inds)
        if cluster_duplicates:
            ind_to_keep = np.unique(all_top_inds, return_index=True)[1]
            all_top_names = all_top_names[ind_to_keep]
            all_top_rankings = all_top_rankings[ind_to_keep]
            all_top_clusters = all_top_clusters[ind_to_keep]
        dt.write2dArray(all_top_names, all_top_names_fn)
        dt.write2dArray(all_top_rankings, all_top_rankings_fn)
        dt.write2dArray(all_top_clusters, all_top_clusters_fn)