def main(argv):
    """Run kernel-approximation training (nystrom or CUR) on a WMD kernel dataset.

    Loads the precomputed EMD/kernel matrix selected by FLAGS.dataset, builds
    the hyperparameter config from FLAGS, and dispatches to train_all either
    in validation mode or with held-out test data (FLAGS.run_mode == "test").
    """
    # dataset -> (train kernel file, test kernel file, .mat format version).
    # NOTE(review): the recipe test entry has no ".mat" suffix in the original
    # code ("recipe_K_set1") -- kept byte-identical; confirm that file exists.
    dataset_info = {
        "ohsumed": ("oshumed_K_set1.mat", "oshumed_K_set1.mat", "v7.3"),
        "twitter": ("twitter_K_set1.mat", "twitter_K_set1.mat", "default"),
        "news": ("20ng2_new_K_set1.mat", "20ng2_new_K_set1.mat", "v7.3"),
        "recipe": ("recipe_trainData.mat", "recipe_K_set1", "v7.3"),
    }
    dataset = FLAGS.dataset
    if dataset not in dataset_info:
        # Original code silently left `filename` undefined here and crashed
        # later with a NameError; fail early instead.
        print("unknown dataset: " + str(dataset))
        return None
    filename, test_filename, version = dataset_info[dataset]

    approximator = FLAGS.method
    if approximator not in ["nystrom", "CUR"]:
        print("please choose between nystrom and CUR for approximator")
        return None

    # get EMD matrix
    similarity_matrix, labels = read_mat_file(
        file_="./WordMoversEmbeddings/mat_files/" + filename,
        version=version, return_type="all")

    # set hyperparameters
    config = {"samples": FLAGS.sample_size,
              "CV": 10,
              "gamma": FLAGS.gamma,
              "lambda_inverse": FLAGS.lambda_inverse,
              "approximator": approximator,
              "run_mode": FLAGS.run_mode}

    if config["run_mode"] == "test":
        test_sim_mat, test_labels = read_mat_file(
            file_="./WordMoversEmbeddings/mat_files/" + test_filename,
            version=version, return_type="all", mode="test")
        train_all(similarity_matrix, labels, config,
                  X_test=test_sim_mat, Y_test=test_labels)
    else:
        # BUGFIX: the original `else` branch (taken for every run_mode other
        # than "validate") referenced test data that is only loaded when
        # run_mode == "test", raising NameError. Treat every non-"test" mode
        # as validation.
        train_all(similarity_matrix, labels, config)
    return None
def create_spectrogram_images(mat_file_path):
    """Build one spectrogram per channel of the recording stored in a .mat file.

    Channels are taken as the columns of the loaded data array (iterated via
    its transpose); the result is a stacked numpy array of spectrograms.
    """
    channel_data, _properties = utils.read_mat_file(mat_file_path)
    return np.array(
        [utils.convert_to_spectrogram(channel) for channel in channel_data.T]
    )
def get_features(mat_file):
    """Yield a feature vector per time window of the recording in *mat_file*.

    Each window's vector concatenates Butterworth-filter features, FFT
    features (both computed per channel) and cross-correlation features.
    """
    d, _ = ut.read_mat_file(mat_file)
    # 5 seconds = 600/5 = 120 windows
    # BUGFIX: use integer division -- a float window size breaks range() and
    # slicing under Python 3 (identical result under Python 2).
    window_size = d.shape[0] // 120
    for i in range(0, d.shape[0], window_size):
        # One window, transposed so iteration runs over channels.
        wd = d[i:i + window_size, :].T
        # BUGFIX: wrap map() in list() -- under Python 3 np.array(map(...))
        # produces a useless 0-d object array (no-op under Python 2).
        bf_feats = np.array(list(map(get_butter_features, wd)))
        ff_feats = np.array(list(map(get_fft_features, wd)))
        xcorr_feats = get_corr_features(wd)
        yield np.append(np.append(bf_feats, ff_feats), xcorr_feats)
# Train an ML-HMM on label-distorted HM-SVM data and report training accuracy
# for each distortion level. (Python 2 / Shogun script; the test-data
# evaluation continues beyond this chunk.)
distorts = [0, 10, 20, 30, 40]  # percentage of total labels distorted
evaluator = StructuredAccuracy()
for distort in distorts:
    print 'data with ' + str(distort) + '% of total labels distorted'

    ### prepare training, test data and evaluator
    train_data_file = 'hmsvm_%d_distort_data_fold' % distort
    train_num_examples_fold = 20
    train_num_folds = 5
    train_labels, train_features = utils.unfold_data(train_data_file)
    test_data_file = 'hmsvm_%d_distort_data_test' % distort
    test_num_examples = 100
    test_labels, test_features = utils.read_mat_file(test_data_file, test_num_examples)

    ### train ML-HMM and evaluate in training data
    model = HMSVMModel(train_features, train_labels, SMT_TWO_STATE)
    model.set_use_plifs(True)  # piecewise-linear feature transforms
    mlhmm = MLHMM(model)
    mlhmm.train()
    prediction = mlhmm.apply()
    accuracy = evaluator.evaluate(prediction, train_labels)
    print '\ttraining accuracy:\t' + str(accuracy*100) + '%'
    utils.print_statistics(train_labels, prediction)

    ### evaluate in test data
def main(variables):
    """Bayes-classifier experiment driver.

    Loads features F1/F2 from the .mat file at variables["data_path"], trains
    a Bayes classifier on F1, reports accuracy, z-normalizes F1 using the
    trained per-class mean/std, plots the data, and repeats training on the
    z1, f2 and joint [z1, f2] feature sets.
    """
    data = read_mat_file(variables["data_path"])
    f1_data, f2_data = np.array(data['F1']), np.array(data['F2'])
    n_samples = f1_data.shape[0]
    # Each row carries the five class ids column-wise (column i <-> class i).
    ground_truth = np.array([[0, 1, 2, 3, 4] for _ in range(n_samples)])
    print("About the data")
    print("Source of data: ", variables["data_path"])
    print("Classes of data: 0,1,2,3,4")
    print("No. of samples: ", n_samples, "\n")
    #Training on 100 samples
    #m_std is dictionary of f1, f2 for each column, c1 c2 c3 c4 and c5.
    print("\n---------- Section 1: Training -------------")
    print("\n Calculating the means and standard deviations for 100 samples\n")
    train_size = variables['training_size']
    b1 = Bayes_Classifier(f1_data, train_size)
    m_std_train = b1.train()

    ## Section 2.1: Testing
    print("\n---------- Section 2.1: Testing -------------")
    print("\n Predicting the classes for 101: 1000 samples")
    predicted = b1.predict()

    ## Section 2.2: Calculating accuracy and error rate
    print(
        "\n---------- Section 2.2: Calculating accuracy for the classifier -------------"
    )
    print("\nAccuracy for the Bayes classifier: ")
    acc = b1.validate(predicted)

    ## Section 3: Standard Normal (z score)
    print("---------- Section 3: Standard normal(Z Score) -------------")
    # z1_data is the standard normalized data.
    # Normalize each of the five columns of f1 with its training mean/std.
    z1_data=np.swapaxes(np.array([std_normalize(f1_data[:,i],m_std_train['f1'][i]['m'],\
        m_std_train['f1'][i]['std']) for i in range(5)]),0,1)
    print("Plot of Z1 vs F2")
    plot_clustered_graph(z1_data.flatten(),
                         f2_data.flatten(),
                         ground_truth.flatten(),
                         name="z1vsf2.png",
                         labels=['z1', 'f2'])
    # z1_data is the standard normalized data.
    print("\n Plot of F1 vs F2")
    plot_clustered_graph(f1_data.flatten(),
                         f2_data.flatten(),
                         ground_truth.flatten(),
                         name="f1vsf2.png",
                         labels=['f1', 'f2'])

    ## Section 4
    ### Case 1: Training with the z1 data
    # NOTE(review): section numbering in the headings (Case 2/3/4 vs the
    # "Case 1" comment) is inconsistent in the original -- left untouched
    # because the strings are runtime output.
    print(
        "\n---------- Section 4, Case 2: Training with the z1 data -------------"
    )
    b = Bayes_Classifier(z1_data)
    b.train()
    predicted = b.predict()
    acc = b.validate(predicted)
    print(
        "\n---------- Section 4, Case 3: Training with the f2 data -------------"
    )
    b = Bayes_Classifier(f2_data)
    b.train()
    predicted = b.predict()
    acc = b.validate(predicted)
    print(
        "\n---------- Section 4, Case 4: Training with the [z1, f2] data -------------"
    )
    data = {'z1': z1_data, 'f2': f2_data}
    b = Multivariate_Bayes_Classifier(data)
    b.train()
    predicted = b.predict()
    acc = b.validate(predicted)
K = 5 # number of examples per fold num_fold_examples = 20 # length of each example example_len = 250 # number of features per example num_features = 10 # the number different label values num_states = 2 # K models that will contain the data of each fold models = [] # load each data fold in a HMSVMModel data_file = 'hmsvm_30_distort_data_fold' for k in xrange(K): labels, features = utils.read_mat_file('%s_%d' % (data_file, k), num_fold_examples) models.append(HMSVMModel(features, labels, SMT_TWO_STATE)) # put together folds, leaving out one of them for each set labels_no_fold = [] features_no_fold = [] # for each fold for k1 in xrange(K): # put all labels/features together except the ones of the current fold labels_no_kfold = SequenceLabels(num_fold_examples*(K-1), num_states) features_no_kfold = RealMatrixFeatures(num_fold_examples*(K-1), num_features) # index for the next feature vector to set idx = 0 for k2 in xrange(K):
def compute_min_eig(Z):
    """Return the eigenvalues of Z via np.linalg.eigvals.

    NOTE(review): despite the name, this returns *all* eigenvalues, not just
    the minimum; callers appear to reduce the array themselves. Name kept
    unchanged for interface compatibility.
    """
    return np.linalg.eigvals(Z)

# ---- load the similarity matrix selected on the command line ----
filetype = None
dataset = sys.argv[1]
if dataset == "PSD":
    # Random Gram matrix: guaranteed positive semi-definite.
    feats = np.random.random((1000, 1000))
    similarity_matrix = feats @ feats.T
    filetype = "numpy"
elif dataset in ("mrpc", "rte", "stsb"):
    filename = "../GYPSUM/" + dataset + "_predicts_0.npy"
    filetype = "python"
elif dataset == "twitter":
    similarity_matrix = read_mat_file(file_="./WordMoversEmbeddings/mat_files/twitter_K_set1.mat")
else:
    # BUGFIX: the original if-chain silently fell through on an unknown
    # dataset, leaving `similarity_matrix`/`filename` undefined and crashing
    # later with a NameError. Fail fast with a clear message.
    raise ValueError("unsupported dataset: %s" % dataset)

if filetype == "python":
    similarity_matrix = read_file(filename)

sample_size = int(sys.argv[2])
runs = 50
n_bins = 50

# check for similar rows or columns
if dataset != "PSD":
    unique_rows, indices = np.unique(similarity_matrix, axis=0, return_index=True)
    similarity_matrix_O = similarity_matrix[indices][:, indices]
    # symmetrization
    similarity_matrix = (similarity_matrix_O + similarity_matrix_O.T) / 2.0
# NOTE(review): this `if` is the tail of a function whose `def` lies outside
# this chunk; `return_type`, `similarity_matrix`, `KS`, `A` and `min_eig` are
# locals of that enclosing function.
if return_type == "error":
    # Relative Frobenius-norm Nystrom reconstruction error
    # ||K - C pinv(A) C^T|| / ||K||, returned together with min_eig.
    return np.linalg.norm(\
        similarity_matrix - \
        KS @ np.linalg.pinv(A) @ KS.T)\
        / np.linalg.norm(similarity_matrix), min_eig

##########################################################################################

# Experiment setup: load the recipe WMD kernel, deduplicate and symmetrize it.
step = 50
runs_ = 3

"""
20ng2_new_K_set1.mat
oshumed_K_set1.mat
recipe_K_set1.mat
recipe_trainData.mat
twitter_K_set1.mat
twitter_set1.mat
"""

# filename = "stsb"
id_count = 500 #len(similarity_matrix) #1000

similarity_matrix = read_mat_file(file_="WordMoversEmbeddings/mat_files/recipe_trainData.mat",\
    version="v7.3")
# similarity_matrix = read_file("../GYPSUM/"+filename+"_predicts_0.npy")

# check for similar rows or columns
unique_rows, indices = np.unique(similarity_matrix, axis=0, return_index=True)
similarity_matrix_O = similarity_matrix[indices][:, indices]
# symmetrization
similarity_matrix = (similarity_matrix_O + similarity_matrix_O.T) / 2.0

# Scaling multipliers and Z values swept by the experiment below.
multipliers = list(np.arange(1.0, 2.3, 0.5))
list_of_list_of_errors = []
list_of_min_eig_scaling = []
z_range = [1, 2, 5, 10]
# eps=1e-16
# Train an ML-HMM on 40%-distorted HM-SVM data and evaluate on training data.
# (Python 2 / Shogun script.)
from mlhmm import MLHMM
from itertools import product
from modshogun import StructuredAccuracy, HMSVMModel, SMT_TWO_STATE

### prepare training, test data and evaluator
distort = 40  # percentage of total labels distorted in this data set
train_data_file = 'hmsvm_%d_distort_data_fold' % distort
train_num_examples_fold = 20
train_num_folds = 5
train_labels, train_features = utils.unfold_data(train_data_file)
test_data_file = 'hmsvm_%d_distort_data_test' % distort
test_num_examples = 100
test_labels, test_features = utils.read_mat_file(test_data_file, test_num_examples)
evaluator = StructuredAccuracy()

### train ML-HMM and evaluate in training data
print 'training ML-HMM'
model = HMSVMModel(train_features, train_labels, SMT_TWO_STATE)
model.set_use_plifs(True)  # piecewise-linear feature transforms
mlhmm = MLHMM(model)
mlhmm.train()

# NOTE(review): commented-out parameter dump; its closing quotes lie beyond
# this chunk.
'''
print '\n\tmodel parameters:'
print '\t- transition scores: ' + str(numpy.exp(mlhmm.transition_scores))
print '\t- feature scores:'
for s,f in product(xrange(mlhmm.num_free_states), xrange(mlhmm.num_features)):
# number of features per example num_features = 10 # the number different label values num_states = 2 distorts = [0, 10, 20, 30, 40] for distort in distorts: # K models that will contain the data of each fold models = [] print '>>>> data with ' + str(distort) + '% of total labels distorted' data_file = 'hmsvm_%d_distort_data_fold' % distort for k in xrange(K): fold_data_file = '%s_%d' % (data_file, k) labels, features = utils.read_mat_file(fold_data_file, num_fold_examples) models.append(HMSVMModel(features, labels, SMT_TWO_STATE)) # check if checks: print 'running checks on simulated data' for k in xrange(K): labels = models[k].get_labels() features = RealMatrixFeatures.obtain_from_generic( models[k].get_features()) print '\tmodel %d with %d labels and %d features' % ( k, labels.get_num_labels(), features.get_num_vectors()) assert (labels.get_num_labels() == features.get_num_vectors()) for i in xrange(labels.get_num_labels()):
# number of features per example num_features = 10 # the number different label values num_states = 2 distorts = [0, 10, 20, 30, 40] for distort in distorts: # K models that will contain the data of each fold models = [] print '>>>> data with ' + str(distort) + '% of total labels distorted' data_file = 'hmsvm_%d_distort_data_fold' % distort; for k in xrange(K): fold_data_file = '%s_%d' % (data_file, k) labels, features = utils.read_mat_file(fold_data_file, num_fold_examples) models.append(HMSVMModel(features, labels, SMT_TWO_STATE)) # check if checks: print 'running checks on simulated data' for k in xrange(K): labels = models[k].get_labels() features = RealMatrixFeatures.obtain_from_generic(models[k].get_features()) print '\tmodel %d with %d labels and %d features' % (k, labels.get_num_labels(), features.get_num_vectors()) assert(labels.get_num_labels() == features.get_num_vectors()) for i in xrange(labels.get_num_labels()): label = Sequence.obtain_from_generic(labels.get_label(i))
# Load the similarity matrix selected on the command line and initialize the
# error accumulators for the approximation experiment.
step = 50
runs_ = 3

"""
20ng2_new_K_set1.mat
oshumed_K_set1.mat
recipe_K_set1.mat
recipe_trainData.mat
twitter_K_set1.mat
twitter_set1.mat
"""

filetype = None
dataset = sys.argv[1]

# dataset -> (kernel-file path, .mat version kwarg; None = loader default).
_MAT_SOURCES = {
    "twitter": ("./WordMoversEmbeddings/mat_files/twitter_K_set1.mat", None),
    "ohsumed": ("./WordMoversEmbeddings/mat_files/oshumed_K_set1.mat", "v7.3"),
    "recipe": ("/mnt/nfs/work1/elm/ray/recipe_trainData.mat", "v7.3"),
    "news": ("/mnt/nfs/work1/elm/ray/20ng2_new_K_set1.mat", "v7.3"),
}

if dataset == "PSD":
    # Random Gram matrix: guaranteed positive semi-definite.
    feats = np.random.random((1000, 1000))
    similarity_matrix = feats @ feats.T
    filetype = "numpy"
elif dataset in ("mrpc", "rte", "stsb"):
    filename = "../GYPSUM/" + dataset + "_predicts_0.npy"
    filetype = "python"
elif dataset in _MAT_SOURCES:
    path, version = _MAT_SOURCES[dataset]
    if version is None:
        similarity_matrix = read_mat_file(file_=path)
    else:
        similarity_matrix = read_mat_file(file_=path, version=version)
else:
    # BUGFIX: the original if-chain silently fell through on an unknown
    # dataset, leaving `similarity_matrix` undefined and crashing later with
    # a NameError. Fail fast with a clear message.
    raise ValueError("unsupported dataset: %s" % dataset)

if filetype == "python":
    similarity_matrix = read_file(filename)
# similarity_matrix = read_file("../GYPSUM/"+filename+"_predicts_0.npy")

# Per-method error accumulators filled by the experiment below.
true_error = []
KS_corrected_error_list = []
KS_ncorrected_error_list = []
scaling_error_list = []