def main():
    # conversations = [("B", "A"), ("B", "M"), ("C", "E"), ("C", "F"), ("D", "G"), ("D", "L"), ("E", "I")]
    # conversations = [("E", "J"), ("F", "E"), ("F", "J"), ("G", "B"), ("G", "L")]
    conversations = [[]]
    for convo in conversations:
        convo_reduction_start = time.time()
        # names = data_handling.read_names(["case_study"])
        print("Conversation: ", convo)
        names = data_handling.read_names(["geco", data_handling.get_conversation_directory(convo)])
        samples, targets, zero_length_indices = data_handling.dataset(names)
        names = np.delete(names, zero_length_indices)
        print("original shapes, samples: {} \ttargets: {} \tnames: {}".format(np.shape(samples), np.shape(targets), np.shape(names)))

        samples = preprocessing.scale_select(samples)

        # data = np.hstack((samples, np.reshape(targets, (len(targets), 1))))
        # print("Data shape: {}".format(np.shape(data)))
        # unique, counts = np.unique(targets, return_counts=True)
        # print("Class breakdown: {}".format(dict(zip(unique, counts))))
        # data = preprocessing.balance_classes(data)
        # print("Balanced data shape: {}".format(np.shape(data)))
        # samples = data[:, :-1]
        # targets = data[:, -1]

        # Hyperparameter exploration over perplexity, early exaggeration and learning rate
        if True:
            perplexities = [30]
            learning_rates = np.arange(300, 700, 100)
            # learning_rates = [200]
            exaggerations = [72, 84, 96]
            for perplexity in perplexities:
                for exaggeration in exaggerations:
                    for l_r in learning_rates:
                        print("p: {}, lr: {}, exag: {}".format(perplexity, l_r, exaggeration))
                        start = time.time()
                        # tsne = manifold.TSNE(n_components=2, init='pca', perplexity=perplexity,
                        #                      early_exaggeration=exaggeration, learning_rate=l_r, n_iter=1000,
                        #                      random_state=data_handling.RANDOM_SEED, verbose=1)
                        # reduced_samples = tsne.fit_transform(samples)
                        reduced_samples, tsne = preprocessing.tsne_reduction(samples, perplexity, l_r=l_r,
                                                                             ex=exaggeration, iterations=1000,
                                                                             verbosity=1)
                        end = time.time()

                        log_entry = {
                            "perplexity": perplexity,
                            "learning_rate": int(l_r),
                            "early_exaggeration": int(exaggeration),
                            "final_error": str(tsne.kl_divergence_)
                        }
                        logger.store_log_entry(log_entry, "tSNE_viz_log.json")

                        print("reduced samples shape: ", np.shape(reduced_samples))
                        print("Reduction took: {}\n".format(end - start))
                        # cluster_scatter_2D(reduced_samples, targets, tsne)
                        scatter_2D(reduced_samples, tsne)
                        print()

        convo_reduction_end = time.time()
        print("Conversation explored in: {}\n".format(convo_reduction_end - convo_reduction_start))

    # Single fixed-parameter reduction (disabled)
    if False:
        tsne = manifold.TSNE(n_components=2, init='pca', perplexity=5, early_exaggeration=36,
                             learning_rate=800, n_iter=5000,
                             random_state=data_handling.RANDOM_SEED, verbose=1)
        reduced_samples = tsne.fit_transform(samples)

        # Log the hard-coded parameters of the TSNE construction above
        log_entry = {
            "perplexity": 5,
            "learning_rate": 800,
            "early_exaggeration": 36,
            "final_error": str(tsne.kl_divergence_)
        }
        logger.store_log_entry(log_entry, "tSNE_viz_log.json")

        print("reduced samples shape: ", np.shape(reduced_samples))
        cluster_scatter_2D(reduced_samples, targets, tsne)
        print()
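
# The exploration loop above delegates the reduction to preprocessing.tsne_reduction.
# A minimal sketch of what that helper is assumed to wrap, based on the commented-out
# TSNE construction above; the real helper may differ in defaults and naming.
def tsne_reduction_sketch(samples, perplexity, l_r, ex, iterations=1000, verbosity=0):
    """Embed samples in 2D with t-SNE; return the embedding and the fitted model."""
    tsne = manifold.TSNE(n_components=2, init='pca', perplexity=perplexity,
                         early_exaggeration=ex, learning_rate=l_r, n_iter=iterations,
                         random_state=data_handling.RANDOM_SEED, verbose=verbosity)
    reduced_samples = tsne.fit_transform(samples)
    return reduced_samples, tsne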
def main():
    names = data_handling.read_names(["keynote"])
    samples, targets, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)

    # Samples Preprocessing
    samples = data_handling.scale_select(samples)

    # Class balancing
    # data = data_handling.balance_classes(data)
    # print("Balanced data shape: {}".format(np.shape(data)))
    # samples = data[:, :-1]
    # targets = data[:, -1]
    # unique, counts = np.unique(targets, return_counts=True)
    # print("Class breakdown: {}\n".format(dict(zip(unique, counts))))

    # output_file
    # data_handling.output_files_labels_case_study(names, targets, data_root2)

    unique, counts = np.unique(targets, return_counts=True)
    print("The case study data are broken into classes like so: {}".format(dict(zip(unique, counts))))

    #### CROSS validation
    print("Perplexity: {}".format(PERPLEXITY))

    # Reduction
    # reduced_samples = data_handling.tsne_reduction(samples, PERPLEXITY, l_r=LEARNING_RATE)
    # print("Reduced samples shape: {}".format(np.shape(reduced_samples)))
    # ########

    data = np.hstack((samples, np.reshape(targets, (len(targets), 1))))
    # data = np.hstack((reduced_samples, np.reshape(targets, (len(targets), 1))))
    # data = data_handling.balance_classes(data)

    clf = SVC(C=1, class_weight='balanced', verbose=0, probability=True)
    # cv = ShuffleSplit(n_splits=1, test_size=0.8, random_state=0)

    score1 = cross_val_score(clf, data[:, :-1], data[:, -1], cv=5, scoring='accuracy')
    # score1 = cross_val_score(clf, reduced_samples, targets, cv=5, scoring='accuracy')
    print("Cross validation (accuracy) score")
    print(score1)
    print("average:")
    print(np.average(score1))
    print("std:")
    print(np.std(score1))

    score2 = cross_val_score(clf, data[:, :-1], data[:, -1], cv=5, scoring='precision_macro')
    # score2 = cross_val_score(clf, reduced_samples, targets, cv=5, scoring='precision_macro')
    print("Cross validation (precision) score")
    print(score2)
    print("average:")
    print(np.average(score2))
    print("std:")
    print(np.std(score2))

    score3 = cross_val_score(clf, data[:, :-1], data[:, -1], cv=5, scoring='recall_macro')
    # score3 = cross_val_score(clf, reduced_samples, targets, cv=5, scoring='recall_macro')
    print("Cross validation (recall) score")
    print(score3)
    print("average:")
    print(np.average(score3))
    print("std:")
    print(np.std(score3))

    # Split in train and test
    train, test, train_targets, test_targets = train_test_split(samples, targets, test_size=0.1, random_state=1337)
    # train, test, train_targets, test_targets = train_test_split(reduced_samples, targets, test_size=0.1, random_state=1337)
    print("shape train: {}".format(np.shape(train)))
    print("shape test: {}".format(np.shape(test)))

    clf = SVC(C=1, class_weight='balanced', verbose=0, probability=True)
    # clf.fit(train[:, :-1], train[:, -1])
    clf.fit(train, train_targets)

    # #### Try on Keynote #########
    print("Train confusion matrix=")
    predictions_train = clf.predict(train)
    cnf_train = confusion_matrix(train_targets, predictions_train, labels=["silences", "breathing", "clicks"])
    fig = plt.figure()
    ax1 = fig.add_subplot(121)
    plot_confusion_matrix(cnf_train, classes=["silences", "breathing", "clicks"], normalize=True,
                          title='Train confusion matrix normalized')

    print("Test confusion matrix=")
    # predictions_test = clf.predict(test[:, :-1])
    predictions_test = clf.predict(test)
    cnf_test = confusion_matrix(test_targets, predictions_test, labels=["silences", "breathing", "clicks"])
    ax2 = fig.add_subplot(122)
    plot_confusion_matrix(cnf_test, classes=["silences", "breathing", "clicks"], normalize=True,
                          title='Test confusion matrix normalized')

    # print("confusion matrix=")
    # predictions = clf.predict(data[:, :-1])
    # cnf = confusion_matrix(data[:, -1], predictions, labels=["silences", "breathing", "clicks"])
    # # fig = plt.figure()
    # ax3 = fig.add_subplot(122)
    # plot_confusion_matrix(cnf, classes=["silences", "breathing", "clicks"], normalize=True,
    #                       title=' confusion matrix normalized')

    plt.show()
def main():
    conversation = ["B", "A"]
    # conversation = []
    names = data_handling.read_names(["geco", data_handling.get_conversation_directory(conversation)])
    samples, _, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)
    speakers = data_handling.get_speakers(names)
    print("Original shapes, samples: {} \t names: {} \t speakers: {}".format(np.shape(samples), np.shape(names), np.shape(speakers)))

    # Samples Preprocessing
    samples = preprocessing.scale_select(samples)
    print("After feature selection shapes, samples: {}".format(np.shape(samples)))
    # ###########

    # Reduction
    print("Reduction")
    reduced_samples, tsne_params = tSNE_IO.load_or_reduce(samples, PERPLEXITY, LEARNING_RATE, EXAGGERATION,
                                                          visually_best=VISUALLY_BEST)
    print("After dimensionality reduction shapes, samples: {}".format(np.shape(reduced_samples)))
    # ########

    if True:
        # KMeans fit
        print()
        predictions = fit_kmeans(reduced_samples)
        plotter.plot_clusters(reduced_samples, predictions, "KMeans GECO unsupervised", speakers, VISUALLY_BEST)
        results_IO.clusters_to_file(names, reduced_samples, predictions, "KMeans", VISUALLY_BEST)

        # AgglomerativeClustering
        print()
        predictions = fit_AgglomerativeClustering(reduced_samples)
        plotter.plot_clusters(reduced_samples, predictions, "Agglomerative Clustering GECO unsupervised", speakers,
                              VISUALLY_BEST)
        results_IO.clusters_to_file(names, reduced_samples, predictions, "Agglomerative_Clustering", VISUALLY_BEST)

        # DBSCAN fit
        # 3.1 35
        print()
        EPS = 2.8
        MIN_SAMPLES = 50
        predictions = fit_DBSCAN(reduced_samples, eps=EPS, min_samples=MIN_SAMPLES)
        unique, counts = np.unique(predictions, return_counts=True)
        print("The predictions are broken down as: {}".format(dict(zip(unique, counts))))
        create_log_entry(conversation, EPS, MIN_SAMPLES, tsne_params)
        plotter.plot_clusters(reduced_samples, predictions, "DBSCAN GECO unsupervised", speakers, VISUALLY_BEST)
        results_IO.clusters_to_file(names, reduced_samples, predictions, "DBSCAN", VISUALLY_BEST)

    # DBSCAN exploration
    if False:
        print()
        for min_samples in np.arange(45, 60, 3):
            for eps in np.arange(2.6, 3.5, .05):
                print("min_samples: {}, eps: {}".format(min_samples, eps))
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                dbscan.fit(reduced_samples)
                predictions = dbscan.labels_
                print("Predictions shape: {}".format(np.shape(predictions)))
                unique, counts = np.unique(predictions, return_counts=True)
                print("The predictions are broken down as: {}".format(dict(zip(unique, counts))))
                # plotter.plot_clusters(reduced_samples, predictions, "DBSCAN Keynote unsupervised", type="predicted")

                # Relabel DBSCAN noise (-1) as its own trailing cluster, then scatter per cluster
                for i, prediction in enumerate(predictions):
                    if prediction == -1:
                        predictions[i] = unique[-1] + 1
                clustered = [[] for _ in unique]
                for i in range(len(reduced_samples)):
                    clustered[int(predictions[i])].append(reduced_samples[i])
                # colors = [plt.cm.Spectral(c) for c in unique]
                labels = ["label_{}".format(i + 1) for i in unique]
                for i, cluster in enumerate(clustered):
                    cluster = np.array(cluster)
                    plt.scatter(cluster[:, 0], cluster[:, 1], label=labels[i], cmap="RdBu", marker=".", linewidths=0)
                plt.legend()
                plt.title("DBSCAN clustering for eps: {}, min_samples: {}".format(eps, min_samples))
                plt.show()
                print()
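
# fit_kmeans, fit_AgglomerativeClustering, and fit_DBSCAN are defined elsewhere
# in the repo. A minimal sketch of the assumed fit_DBSCAN wrapper, mirroring the
# inline DBSCAN usage in the exploration branch above; the real helper may differ.
def fit_DBSCAN_sketch(samples, eps, min_samples):
    """Fit DBSCAN and return per-sample cluster labels (-1 marks noise points)."""
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(samples)
    return dbscan.labels_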
def main():
    names = data_handling.read_names(["keynote"])
    samples, targets, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)
    print("Original shapes, samples: {} \t targets: {} \t names: {}".format(
        np.shape(samples), np.shape(targets), np.shape(names)))
    unique, counts = np.unique(targets, return_counts=True)
    print("Class breakdown: {}".format(dict(zip(unique, counts))))

    # Samples Preprocessing
    samples = preprocessing.scale_select(samples)
    print("Feature selection shapes, samples: {} \t targets: {}\n".format(
        np.shape(samples), np.shape(targets)))
    # ###########

    # Dimensionality reduction and classification exploration
    if False:
        dimensionalities = [100, 150, 170, 200]
        perplexities = [35, 40, 45, 50]
        learning_rates = [500, 800]
        for dimensionality in dimensionalities:
            for perplexity in perplexities:
                for l_r in learning_rates:
                    print("Dimensionality: {} \t Perplexity: {} \t Learning rate: {}".format(
                        dimensionality, perplexity, l_r))

                    # Reduction
                    print("Reduction")
                    reduced_samples = tSNE_IO.load_or_reduce(samples)
                    print("Reduced samples shape: {}".format(np.shape(reduced_samples)))
                    # ########

                    # Class balancing
                    data = np.hstack((reduced_samples, np.reshape(targets, (len(targets), 1))))
                    data, names = preprocessing.balance_classes(data, names, np.shape(reduced_samples)[1])
                    print("Balanced data shape: {}".format(np.shape(data)))
                    reduced_samples = data[:, :-1]
                    targets = data[:, -1]
                    unique, counts = np.unique(targets, return_counts=True)
                    print("Class breakdown: {}\n".format(dict(zip(unique, counts))))
                    # ###########

                    # Split in train and test
                    train, test = preprocessing.split(reduced_samples, targets, 0.15)
                    # #########

                    for c in np.arange(.1, .9, .1):
                        # SVM fitting
                        clf = SVC(C=c, class_weight='balanced', verbose=0, probability=True)
                        clf.fit(train[:, :-1], train[:, -1])
                        score = clf.score(test[:, :-1], test[:, -1])
                        print("SVC score of fit on case study data: {}, with C: {}".format(
                            score, clf.get_params()["C"]))

                        parameters = {
                            "algo": "SVC",
                            "kernel": clf.get_params()["kernel"],
                            "dataset": "keynote",
                            "tsne_perplexity": perplexity,
                            "tsne_learning_rate": l_r,
                            "tsne_dimensionality": dimensionality,
                            "svc_c": c,
                            "score": score,
                        }
                        logger.store_log_entry(parameters, "keynote_supervised_exploration_log.json")
                    print()

    # Assume dimensionality reduction done; classification exploration only
    if True:
        # Reduction
        print("Reduction")
        reduced_samples = tSNE_IO.load_or_reduce(samples, PERPLEXITY, LEARNING_RATE, EXAGGERATION)
        print("Reduced samples shape: {}".format(np.shape(reduced_samples)))
        # ########

        # Class balancing
        data = np.hstack((reduced_samples, np.reshape(targets, (len(targets), 1))))
        data, names = preprocessing.balance_classes(data, names, np.shape(reduced_samples)[1])
        print("Balanced data shape: {}".format(np.shape(data)))
        reduced_samples = data[:, :-1]
        targets = data[:, -1]
        unique, counts = np.unique(targets, return_counts=True)
        print("Class breakdown: {}\n".format(dict(zip(unique, counts))))
        # ###########

        # Split in train and test
        train, test = preprocessing.split(reduced_samples, targets, 0.15)
        # #########

        for c in np.arange(.1, 1, .1):
            # SVM fitting
            clf = SVC(C=c, class_weight='balanced', verbose=0, probability=True)
            clf.fit(train[:, :-1], train[:, -1])
            score = clf.score(test[:, :-1], test[:, -1])
            print("SVC score of fit on case study data: {}, with C: {}".format(
                score, clf.get_params()["C"]))

            parameters = {
                "algo": "SVC",
                "kernel": clf.get_params()["kernel"],
                "dataset": "keynote",
                "tsne_perplexity": PERPLEXITY,
                "tsne_learning_rate": LEARNING_RATE,
                "tsne_dimensionality": 2,
                "svc_c": c,
                "score": score,
            }
            logger.store_log_entry(parameters, "keynote_supervised_exploration_log.json")
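
# preprocessing.split is used above as if it returns train/test arrays with the
# target appended as the last column (see the train[:, :-1] / train[:, -1]
# indexing). A hypothetical sketch under that assumption; the actual helper and
# its random seeding may differ.
from sklearn.model_selection import train_test_split

def split_sketch(samples, targets, test_size):
    """Stack targets as a final column, then split the rows into train and test."""
    data = np.hstack((samples, np.reshape(targets, (len(targets), 1))))
    train, test = train_test_split(data, test_size=test_size)  # seeding assumed, omitted here
    return train, test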
def main():
    names = data_handling.read_names(["keynote"])
    samples, targets, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)
    print("Original shapes, samples: {} \t targets: {} \t names: {}".format(
        np.shape(samples), np.shape(targets), np.shape(names)))
    unique, counts = np.unique(targets, return_counts=True)
    print("Class breakdown: {}".format(dict(zip(unique, counts))))

    # Samples Preprocessing
    samples = preprocessing.scale_select(samples)
    print("Feature selection shapes, samples: {} \t targets: {}".format(np.shape(samples), np.shape(targets)))
    # ###########

    # tSNE Reduction
    print("Reduction")
    reduced_samples = tSNE_IO.load_or_reduce(samples, PERPLEXITY, LEARNING_RATE, EXAGGERATION)
    print("Reduced samples shape: {}".format(np.shape(reduced_samples)))
    # ########

    plot_clusters(reduced_samples, targets, "Keynote unbalanced actual", type="actual")

    # Class balancing
    data = np.hstack((reduced_samples, np.reshape(targets, (len(targets), 1))))
    data, names = preprocessing.balance_classes(data, names, np.shape(reduced_samples)[1])
    print("Balanced data shape: {}".format(np.shape(data)))
    reduced_samples = data[:, :-1]
    targets = data[:, -1]
    # ###########

    plot_clusters(reduced_samples, targets, "Keynote balanced actual", type="actual")

    # KMeans fit
    if False:
        kmeans = KMeans(n_clusters=3)
        print("KMeans fitting")
        kmeans.fit(reduced_samples)
        predictions = kmeans.labels_
        print("Predictions shape: {}".format(np.shape(predictions)))
        plot_clusters(reduced_samples, predictions, "KMeans Keynote unsupervised", type="predicted")

    # AgglomerativeClustering
    if False:
        print()
        aggro = AgglomerativeClustering(n_clusters=3)
        print("AgglomerativeClustering fitting")
        aggro.fit(reduced_samples)
        predictions = aggro.labels_
        plot_clusters(reduced_samples, predictions, "Agglomerative Clustering Keynote unsupervised", type="predicted")

    # DBSCAN exploration
    if False:
        print()
        for min_samples in np.arange(20, 50, 5):
            for eps in np.arange(4, 10, .5):
                print("min_samples: {}, eps: {}".format(min_samples, eps))
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                print("DBSCAN fitting")
                dbscan.fit(reduced_samples)
                predictions = dbscan.labels_
                print("Predictions shape: {}".format(np.shape(predictions)))
                unique, counts = np.unique(predictions, return_counts=True)
                print("The predictions are broken down as: {}".format(dict(zip(unique, counts))))
                # plot_clusters(reduced_samples, predictions, "DBSCAN Keynote unsupervised", type="predicted")

                # Relabel DBSCAN noise (-1) as its own trailing cluster, then scatter per cluster
                for i, prediction in enumerate(predictions):
                    if prediction == -1:
                        predictions[i] = unique[-1] + 1
                clustered = [[] for _ in unique]
                for i in range(len(reduced_samples)):
                    clustered[int(predictions[i])].append(reduced_samples[i])
                # colors = [plt.cm.Spectral(c) for c in unique]
                labels = ["label_{}".format(i + 1) for i in unique]
                for i, cluster in enumerate(clustered):
                    cluster = np.array(cluster)
                    plt.scatter(cluster[:, 0], cluster[:, 1], label=labels[i], cmap="RdBu")
                plt.legend()
                plt.title("DBSCAN clustering for eps: {}, min_samples: {}".format(eps, min_samples))
                plt.show()
                print()

    # DBSCAN fit
    if True:
        print()
        dbscan = DBSCAN(eps=3.5, min_samples=60)
        print("DBSCAN fitting")
        dbscan.fit(reduced_samples)
        predictions = dbscan.labels_
        print("Predictions shape: {}".format(np.shape(predictions)))
        unique, counts = np.unique(predictions, return_counts=True)
        print("The predictions are broken down as: {}".format(dict(zip(unique, counts))))

        # Relabel DBSCAN noise (-1) as its own trailing cluster
        for i, prediction in enumerate(predictions):
            if prediction == -1:
                predictions[i] = unique[-1] + 1

        plot_clusters(reduced_samples, predictions, "DBSCAN Keynote unsupervised", type="predicted")
        extract_to_file(names, reduced_samples, predictions)
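
# plot_clusters is defined elsewhere in this module. A sketch of what it is
# assumed to do, mirroring the inline scatter loop in the DBSCAN exploration
# above; the real function may differ (e.g. in colouring and how it uses 'type').
def plot_clusters_sketch(samples, labels, title, type="predicted"):
    """Scatter the 2D samples with one legend entry per cluster label."""
    labels = np.asarray(labels)
    for value in np.unique(labels):
        cluster = samples[labels == value]
        plt.scatter(cluster[:, 0], cluster[:, 1], label="label_{}".format(int(value) + 1), marker=".")
    plt.legend()
    plt.title(title)
    plt.show()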