def main():
    file1 = sys.argv[1]
    file2 = sys.argv[2]
    loaded_data1 = file_utils.load_hdf5(file1)
    loaded_data2 = file_utils.load_hdf5(file2)
    assert loaded_data1['dropped_indices'] == loaded_data2['dropped_indices']
    dropped_indices = loaded_data1['dropped_indices'][-1]
    dropped_elements = [(i, j) for i in dropped_indices for j in dropped_indices]
    gram1 = loaded_data1['gram_matrices'][-1]['completed_npsd']
    gram2 = loaded_data2['gram_matrices'][-1]['completed_npsd']
    errs = calculate_errors(gram1, gram2, dropped_elements)
    print("mse:%.10f, mse_dropped:%.10f, mae:%.10f, mae_dropped:%.10f, "
          "re:%.10f, re_dropped:%.10f" % errs)
    return
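# calculate_errors is defined elsewhere in the project and is not shown in this
# excerpt. As a rough, hypothetical sketch of the six quantities printed above
# (mean squared error, mean absolute error, and mean relative error, each over
# all elements and over the dropped elements only), it could look like the
# function below; the project's actual definition may differ.
import numpy as np

def calculate_errors_sketch(gram_reference, gram_completed, dropped_elements):
    """Illustrative only. Assumes dropped_elements is a non-empty list of
    (i, j) index pairs and that the reference matrix has no zero entries."""
    g_ref = np.asarray(gram_reference, dtype=float)
    g_cmp = np.asarray(gram_completed, dtype=float)
    diff = g_ref - g_cmp

    mse = np.mean(diff ** 2)
    mae = np.mean(np.abs(diff))
    re = np.mean(np.abs(diff) / np.abs(g_ref))

    rows, cols = zip(*dropped_elements)
    mse_dropped = np.mean(diff[rows, cols] ** 2)
    mae_dropped = np.mean(np.abs(diff[rows, cols]))
    re_dropped = np.mean(np.abs(diff[rows, cols]) / np.abs(g_ref[rows, cols]))

    return mse, mse_dropped, mae, mae_dropped, re, re_dropped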
def test_save_and_load_hdf5(self):
    filename = "test.hdf5"
    fu.save_hdf5(filename, self.dic)
    dic = fu.load_hdf5(filename)
    print(dic)
    print(self.dic)
    self.__test_save_and_load_hdf5_rec(dic, self.dic)
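# The private helper __test_save_and_load_hdf5_rec is not part of this excerpt.
# A hypothetical sketch of the recursive comparison such a helper could perform
# (not the project's implementation):
import numpy as np

def _compare_dicts_recursively(loaded, original):
    """Illustrative only: assert two nested dicts hold the same keys and
    element-wise equal values after an HDF5 round trip."""
    assert set(loaded.keys()) == set(original.keys())
    for key in loaded:
        if isinstance(original[key], dict):
            _compare_dicts_recursively(loaded[key], original[key])
        else:
            assert np.array_equal(np.asarray(loaded[key]),
                                  np.asarray(original[key]))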
def main(): """Read .pkl file, parse its metadata, plot Gram matrix and save as pdf file with matplotlib. """ filename = os.path.abspath(sys.argv[1]) title = sys.argv[2] if len(sys.argv) > 2 else "" if filename[-4:] == ".pkl": dat = file_utils.load_pickle(filename) filename_pdf_ = filename.replace(".pkl", ".pdf") elif filename[-5:] == ".hdf5": dat = file_utils.load_hdf5(filename) filename_pdf_ = filename.replace(".hdf5", ".pdf") else: assert False dataset_type = dat['dataset_type'] gram_matrices = dat['gram_matrices'] sample_names = dat['sample_names'] labels, separators, dataset_name, rotate = get_informations( dataset_type, sample_names) matrices = gram_matrices[-1] for key in matrices.keys(): filename_pdf = filename_pdf_.replace(".pdf", "_" + key + ".pdf") plot_title = title + " " + key.replace("_", " ") plot_gram_to_pdf(filename_pdf, matrices[key], sample_names, separators, labels, dataset_name, title=plot_title, rotate_vertically=rotate)
def setUp(self):
    pickle_or_hdf5_location = "results/6DMG/30/t1/gram_upperChar_sigma30_triangularNone_t1_noaugmentation.hdf5"
    dataset_location = "/Users/ngym/Lorincz-Lab/project/fast_time-series_data_classification/dataset/6DMG_mat_112712/matR_char"
    loaded_data = file_utils.load_hdf5(os.path.abspath(pickle_or_hdf5_location))
    gram_matrices = loaded_data['gram_matrices']
    self.gram = gram_matrices[0]['original']
    self.sample_names = loaded_data['sample_names']
    self.lmbd = 0.5
    dataset_type = loaded_data['dataset_type']
    sample_names = [s.split('/')[-1].split('.')[0]
                    for s in loaded_data['sample_names']]
    seqs, key_to_str, _ = read_sequences(dataset_type, direc=dataset_location)
    seqs = filter_samples(seqs, sample_names)
    key_to_str = filter_samples(key_to_str, self.sample_names)
    labels = list(key_to_str.values())
    tmp = list(labels)
    counter = Counter(tmp)
    #self.size_groups = [counter[label] for label in sorted(set(tmp), key=tmp.index)]
    self.size_groups = [15] * 26
def main(): """Read .mat file and plot Gram matrix to html with plotly. """ filename = sys.argv[1] if filename[-4:] == ".pkl": dat = file_utils.load_pickle(filename) filename_html_ = filename.replace(".pkl", ".html") elif filename[-5:] == ".hdf5": dat = file_utils.load_hdf5(filename) filename_html_ = filename.replace(".hdf5", ".html") else: assert False dataset_type = dat['dataset_type'] gram_matrices = dat['gram_matrices'] sample_names = dat['sample_names'] matrices = gram_matrices[-1] for key in matrices.keys(): filename_html = filename_html_.replace(".html", "_" + key + ".html") plot_gram_to_html(filename_html, matrices[key], sample_names)
def run(pickle_or_hdf5_location, dataset_location, fold_count, fold_to_drop,
        algorithm, params, output_dir, output_filename_format, output_file):
    ########
    # Create the output directory and back up the configuration file to it
    ########
    os.makedirs(output_dir, exist_ok=True)
    try:
        shutil.copy(os.path.abspath(sys.argv[2]),
                    os.path.join(output_dir, os.path.basename(sys.argv[2])))
    except shutil.SameFileError:
        pass
    hdf5 = pickle_or_hdf5_location[-4:] == "hdf5"
    check_fold(fold_count, fold_to_drop, hdf5)
    check_algorithm(algorithm)
    check_params(algorithm, params)
    pickle_or_hdf5_location = os.path.abspath(pickle_or_hdf5_location)
    dataset_location = os.path.abspath(dataset_location)
    output_dir = os.path.abspath(output_dir)
    assert os.path.isdir(output_dir)
    assert os.path.exists(pickle_or_hdf5_location)

    ########
    # Load complete Gram matrix
    ########
    time_main_start = os.times()
    hdf5 = pickle_or_hdf5_location[-4:] == "hdf5"
    if hdf5:
        loaded_data = file_utils.load_hdf5(pickle_or_hdf5_location)
    else:
        loaded_data = file_utils.load_pickle(pickle_or_hdf5_location)
        check_pickle_format(loaded_data)
    dataset_type = loaded_data['dataset_type']
    if dataset_type == 'UCIauslan':
        loaded_sample_names = loaded_data['sample_names']
    else:
        loaded_sample_names = [s.split('/')[-1].split('.')[0]
                               for s in loaded_data['sample_names']]
    gram_matrices = loaded_data['gram_matrices']
    if len(gram_matrices) == 1:
        gram = gram_matrices[0]['original']
    else:
        gram = gram_matrices[-1]['completed_npsd']

    # drop elements
    if fold_count == 0:
        gram_drop = gram
        indices_to_drop = []   # nothing is dropped when fold_count == 0
        dropped_elements = []
    else:
        folds = k_fold_cross_validation.get_kfolds(dataset_type,
                                                   loaded_sample_names,
                                                   fold_count)
        indices_to_drop = folds[fold_to_drop - 1]
        gram_drop, dropped_elements = make_matrix_incomplete.gram_drop_samples(
            gram, indices_to_drop)

    ########
    # Prepare time-series data
    ########
    seqs, sample_names, labels_str, _ = read_sequences(dataset_type, dataset_location)
    seqs = filter_samples(seqs, sample_names, loaded_sample_names)
    labels_str = filter_samples(labels_str, sample_names, loaded_sample_names)

    ########
    # Execute matrix completion
    ########
    train_start = None
    train_end = None
    if algorithm == "gak":
        ########
        # Baseline GAK
        ########
        gram_completed, time_completion_start, time_completion_end \
            = matrix_completion.gak_matrix_completion(
                gram_drop, seqs, indices_to_drop,
                sigma=params['sigma'], triangular=params['triangular'])
        action = "GAK sigma: " + str(params['sigma']) \
                 + " triangular: " + str(params['triangular'])
        output_filename_format = output_filename_format \
            .replace("${sigma}", str(params['sigma'])) \
            .replace("${triangular}", str(params['triangular']))
    elif algorithm in {"softimpute", "knn", "iterativesvd"}:
        ########
        # Baselines SoftImpute, KNN, IterativeSVD
        ########
        if algorithm == "softimpute":
            func = matrix_completion.softimpute_matrix_completion
            action = "Softimpute"
            print('running SoftImpute')
        elif algorithm == "knn":
            func = matrix_completion.knn_matrix_completion
            action = "KNN"
            print('running KNN')
        elif algorithm == "iterativesvd":
            func = matrix_completion.iterativesvd_matrix_completion
            action = "IterativeSVD"
            print('running IterativeSVD')
        else:
            print("unsupported fancyimpute algorithm")
            exit(-1)
        flag_test = np.zeros(len(seqs))
        flag_test[indices_to_drop] = 1
        drop_flag_matrix = create_true_GAK_flag_matrix(1 - params['gak_rate'], flag_test)
        for i in range(len(seqs)):
            drop_flag_matrix[i, i] = 1
            for j in range(i + 1):
                if i not in indices_to_drop and j not in indices_to_drop:
                    drop_flag_matrix[i, j] = 1
                    drop_flag_matrix[j, i] = 1
        print(len(seqs)**2)
        print(np.count_nonzero(drop_flag_matrix))
        gram_completed, time_completion_start, time_completion_end \
            = func(gram_drop, seqs,
                   sigma=params['sigma'], triangular=params['triangular'],
                   num_process=params['num_process'],
                   drop_flag_matrix=drop_flag_matrix)
    elif algorithm == "rnn":
        ########
        # Our scheme, Siamese Recurrent Neural Network
        ########
        modelfile_hdf5 = os.path.join(output_dir, output_filename_format + "_model.hdf5")
        logfile_loss = os.path.join(output_dir, output_filename_format + ".losses")
        gram_completed, time_train_start, time_train_end, \
            time_completion_start, time_completion_end \
            = matrix_completion.rnn_matrix_completion(
                gram_drop, seqs,
                params['epochs'], params['patience'],
                params['epoch_start_from'],
                logfile_loss, modelfile_hdf5,
                params['rnn'], params['rnn_units'], params['dense_units'],
                params['dropout'], params['implementation'],
                params['bidirectional'], params['batchnormalization'],
                params['mode'], params['loss_function'],
                params['loss_weight_ratio'], labels_str,
                params['siamese_joint_method'],
                params['siamese_arms_activation'],
                trained_modelfile_hdf5=params['trained_modelfile_hdf5'])
        action = "SiameseRNN"
    elif algorithm == "fast_rnn":
        ########
        # Our scheme, Fast Siamese Recurrent Neural Network
        ########
        modelfile_hdf5 = os.path.join(output_dir, output_filename_format + "_model.hdf5")
        logfile_loss = os.path.join(output_dir, output_filename_format + ".losses")
        gram_completed, time_completion_start, time_completion_end \
            = matrix_completion.fast_rnn_matrix_completion(
                gram_drop, seqs,
                params['rnn'], params['rnn_units'], params['dense_units'],
                params['dropout'], params['implementation'],
                params['bidirectional'], params['batchnormalization'],
                params['loss_function'],
                params['siamese_arms_activation'],
                params['siamese_joint_method'],
                trained_modelfile_hdf5=params['trained_modelfile_hdf5'])
        action = "FastSiameseRNN"
    else:
        assert False

    ########
    # Make the completed matrix positive semidefinite, if it is not.
    ########
    # eigenvalue check
    time_npsd_start = os.times()
    gram_completed_npsd = nearest_positive_semidefinite.nearest_positive_semidefinite(
        gram_completed)
    time_npsd_end = os.times()

    ########
    # Save results
    ########
    if hdf5:
        log_file = os.path.join(output_dir, output_filename_format + ".hdf5")
    else:
        log_file = os.path.join(output_dir, output_filename_format + ".pkl")
    action += " " + time.asctime(time.localtime())
    file_utils.append_and_save_result(log_file, loaded_data, gram_drop,
                                      gram_completed, gram_completed_npsd,
                                      indices_to_drop, action, hdf5=hdf5)

    # calculate errors
    mse, mse_dropped, mae, mae_dropped, \
        relative, relative_dropped = calculate_errors(gram, gram_completed_npsd,
                                                      dropped_elements)
    time_main_end = os.times()

    # save run times and errors
    num_calculated_elements = len(dropped_elements) - len(indices_to_drop) // 2
    num_dropped_sequences = len(indices_to_drop)
    out_path = os.path.join(output_dir, output_file)
    file_utils.save_analysis(out_path, len(dropped_elements),
                             num_dropped_sequences, num_calculated_elements,
                             time_completion_start, time_completion_end,
                             time_npsd_start, time_npsd_end,
                             time_main_start, time_main_end,
                             mse, mse_dropped, mae, mae_dropped,
                             relative, relative_dropped)
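# nearest_positive_semidefinite lives in its own module and is not shown in this
# excerpt. A standard way to repair an indefinite symmetric matrix is to clip its
# negative eigenvalues, which yields the nearest positive-semidefinite matrix in
# the Frobenius norm; the sketch below illustrates that idea and is not
# necessarily the project's exact implementation.
import numpy as np

def nearest_psd_sketch(matrix):
    """Illustrative only: symmetrize, then zero out negative eigenvalues."""
    mat = np.asarray(matrix, dtype=float)
    sym = (mat + mat.T) / 2
    eigvals, eigvecs = np.linalg.eigh(sym)     # eigh: for symmetric matrices
    eigvals = np.clip(eigvals, 0.0, None)      # drop negative eigenvalues
    return eigvecs @ np.diag(eigvals) @ eigvecs.T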
def run(pickle_or_hdf5_location, dataset_location, fold_to_test, fold_to_tv,
        fold_count, params, output_dir, output_filename_format,
        data_augmentation_size):
    os.makedirs(output_dir, exist_ok=True)
    shutil.copy(os.path.abspath(sys.argv[2]),
                os.path.join(output_dir, os.path.basename(sys.argv[2])))
    hdf5 = pickle_or_hdf5_location[-4:] == "hdf5"
    if hdf5:
        loaded_data = file_utils.load_hdf5(os.path.abspath(pickle_or_hdf5_location))
    else:
        loaded_data = file_utils.load_pickle(os.path.abspath(pickle_or_hdf5_location))
    dataset_type = loaded_data['dataset_type']
    sample_names = [s.split('/')[-1].split('.')[0]
                    for s in loaded_data['sample_names']]
    gram_matrices = loaded_data['gram_matrices']
    gram = gram_matrices[0]['original']
    sample_names = loaded_data['sample_names']

    folds = k_fold_cross_validation.get_kfolds(dataset_type, sample_names, fold_count)
    folds = np.array(folds)
    test_indices = np.concatenate(folds[fold_to_test])
    tv_indices = np.concatenate(folds[fold_to_tv])
    fold_for_gram = np.delete(np.arange(fold_count), fold_to_test + fold_to_tv)
    gram_indices = np.concatenate(folds[fold_for_gram]).astype(int)

    seqs, key_to_str, _ = read_sequences(dataset_type, dataset_location)
    augmentation_magnification = 1.2
    seqs, key_to_str, flag_augmented = augment_data(
        seqs, key_to_str, augmentation_magnification,
        rand_uniform=True, num_normaldist_ave=data_augmentation_size - 2)
    seqs = filter_samples(seqs, sample_names)
    key_to_str = filter_samples(key_to_str, sample_names)

    logfile_hdf5 = os.path.join(output_dir, output_filename_format + "_model.hdf5")
    logfile_loss = os.path.join(output_dir, output_filename_format + ".losses")
    output_file = os.path.join(output_dir, output_filename_format + ".json")

    (roc_auc_score, f1_score) = KSS_unsupervised_alpha_prediction.get_classification_error(
        gram, gram_indices, tv_indices, test_indices,
        list(seqs.values()), params['epochs'], params['patience'],
        logfile_hdf5, logfile_loss, params['rnn'], params['rnn_units'],
        params['dense_units'], params['dropout'], params['implementation'],
        params['bidirectional'], params['batchnormalization'], params['mode'],
        list(key_to_str.values()), params['lmbd'], params['top_activation'])

    print(pickle_or_hdf5_location + " roc_auc_score: " + str(roc_auc_score)
          + " f1_score: " + str(f1_score))
    dic = dict(roc_auc_score=roc_auc_score, f1_score=f1_score)
    file_utils.save_json(output_file, dic)
def run(pickle_or_hdf5_location, dataset_location, fold_count, fold_to_drop,
        params, output_dir, output_filename_format, output_file,
        data_augmentation_size):
    os.makedirs(output_dir, exist_ok=True)
    try:
        shutil.copy(os.path.abspath(sys.argv[2]),
                    os.path.join(output_dir, os.path.basename(sys.argv[2])))
    except shutil.SameFileError:
        pass
    hdf5 = pickle_or_hdf5_location[-4:] == "hdf5"
    check_fold(fold_count, fold_to_drop, hdf5)
    pickle_or_hdf5_location = os.path.abspath(pickle_or_hdf5_location)
    dataset_location = os.path.abspath(dataset_location)
    output_dir = os.path.abspath(output_dir)
    assert os.path.isdir(output_dir)
    assert os.path.exists(pickle_or_hdf5_location)
    main_start = os.times()
    hdf5 = pickle_or_hdf5_location[-4:] == "hdf5"
    if hdf5:
        loaded_data = file_utils.load_hdf5(pickle_or_hdf5_location)
    else:
        loaded_data = file_utils.load_pickle(pickle_or_hdf5_location)
    dataset_type = loaded_data['dataset_type']
    if dataset_type == 'UCIauslan':
        loaded_sample_names = loaded_data['sample_names']
    else:
        loaded_sample_names = [s.split('/')[-1].split('.')[0]
                               for s in loaded_data['sample_names']]
    gram_matrices = loaded_data['gram_matrices']
    if len(gram_matrices) == 1:
        gram = gram_matrices[0]['original']
    else:
        gram = gram_matrices[-1]['completed_npsd']

    # drop elements
    if fold_count == 0:
        gram_drop = gram
    else:
        folds = k_fold_cross_validation.get_kfolds(dataset_type,
                                                   loaded_sample_names,
                                                   fold_count)
        indices_to_drop = folds[fold_to_drop - 1]
        gram_drop, dropped_elements = make_matrix_incomplete.gram_drop_samples(
            gram, indices_to_drop)

    seqs, sample_names, labels_str, _ = read_sequences(dataset_type, dataset_location)
    seqs = filter_samples(seqs, sample_names, loaded_sample_names)
    labels_str = filter_samples(labels_str, sample_names, loaded_sample_names)

    train_start = None
    train_end = None
    modelfile_hdf5 = os.path.join(output_dir, output_filename_format + "_model.hdf5")
    logfile_loss = os.path.join(output_dir, output_filename_format + ".losses")

    # pre-processing
    num_seqs = len(seqs)
    time_dim = max([seq.shape[0] for seq in seqs])
    pad_value = -4444
    seqs = pad_sequences([seq.tolist() for seq in seqs],
                         maxlen=time_dim, dtype='float32',
                         padding='post', value=pad_value)
    feat_dim = seqs[0].shape[1]
    input_shape = (time_dim, feat_dim)

    K.clear_session()

    # build network
    model = siamese_rnn_branch.SiameseRnnBranch(
        input_shape, pad_value,
        params['rnn_units'], params['dense_units'], params['rnn'],
        params['dropout'], params['implementation'], params['bidirectional'],
        params['batchnormalization'], params['loss_function'],
        params['siamese_joint_method'],
        params['trained_modelfile_hdf5'],
        siamese_arms_activation=params['siamese_arms_activation'])

    test_indices = indices_to_drop
    train_validation_indices = np.delete(np.arange(len(seqs)), test_indices)
    train_validation_seqs = seqs[train_validation_indices]
    test_seqs = seqs[test_indices]

    train_validation_features = model.predict(train_validation_seqs)

    time_pred_start = os.times()
    test_features = model.predict(test_seqs)
    time_pred_end = os.times()

    labels = np.array(labels_str)
    train_validation_labels = labels[train_validation_indices]
    test_labels = labels[test_indices]

    auc, f1, time_classification_start, time_classification_end = \
        linear_svm.compute_classification_errors(train_validation_features,
                                                 train_validation_labels,
                                                 test_features,
                                                 test_labels)

    main_end = os.times()

    num_calculated_sequences = len(test_seqs)

    virtual_prediction_duration = time_pred_end.user - time_pred_start.user \
        + time_pred_end.system - time_pred_start.system
    elapsed_prediction_duration = time_pred_end.elapsed - time_pred_start.elapsed
    virtual_classification_duration = \
        time_classification_end.user - time_classification_start.user \
        + time_classification_end.system - time_classification_start.system
    elapsed_classification_duration = \
        time_classification_end.elapsed - time_classification_start.elapsed

    prediction = {}
    prediction['basics'] = {}
    prediction['basics']['number_of_calculated_sequences'] = len(test_seqs)
    prediction['all'] = {}
    prediction['all']['virtual_prediction_duration'] = virtual_prediction_duration
    prediction['all']['elapsed_prediction_duration'] = elapsed_prediction_duration
    prediction['each_seq'] = {}
    prediction['each_seq']['virtual_prediction_duration_per_calculated_sequence'] = \
        virtual_prediction_duration / num_calculated_sequences
    prediction['each_seq']['elapsed_prediction_duration_per_calculated_sequence'] = \
        elapsed_prediction_duration / num_calculated_sequences

    classification = {}
    classification['basics'] = {}
    classification['basics']['roc_auc'] = auc
    classification['basics']['f1'] = f1
    classification['all'] = {}
    classification['all']['virtual_classification_duration'] = \
        virtual_classification_duration
    classification['all']['elapsed_classification_duration'] = \
        elapsed_classification_duration
    classification['each_seq'] = {}
    classification['each_seq']['virtual_classification_duration_per_calculated_sequence'] = \
        virtual_classification_duration / num_calculated_sequences
    classification['each_seq']['elapsed_classification_duration_per_calculated_sequence'] = \
        elapsed_classification_duration / num_calculated_sequences

    dic = dict(prediction=prediction, classification=classification)

    ###
    lsvm_out_path = os.path.join(output_dir, output_file)
    file_utils.save_json(lsvm_out_path, dic)
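# Note on the timing code above: on POSIX systems, os.times() returns a named
# tuple whose 'user' and 'system' fields are CPU time spent in user and kernel
# mode and whose 'elapsed' field is wall-clock time, all in seconds. The
# "virtual" durations therefore measure CPU time (user + system) while the
# "elapsed" durations measure wall-clock time, e.g.:
import os
import time

t0 = os.times()
time.sleep(0.1)                      # mostly idle: CPU time barely increases
t1 = os.times()
print("cpu  :", (t1.user - t0.user) + (t1.system - t0.system))
print("wall :", t1.elapsed - t0.elapsed)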