Example #1
def main():
    file1 = sys.argv[1]
    file2 = sys.argv[2]
    loaded_data1 = file_utils.load_hdf5(file1)
    loaded_data2 = file_utils.load_hdf5(file2)
    # Both runs must have dropped the same samples for the comparison to make sense.
    assert loaded_data1['dropped_indices'] == loaded_data2['dropped_indices']
    dropped_indices = loaded_data1['dropped_indices'][-1]
    # Pairs whose row and column samples were both dropped in the last run.
    dropped_elements = [(i, j) for i in dropped_indices for j in dropped_indices]
    
    gram1 = loaded_data1['gram_matrices'][-1]['completed_npsd']
    gram2 = loaded_data2['gram_matrices'][-1]['completed_npsd']
    
    errs = calculate_errors(gram1, gram2, dropped_elements)
    print("mse:%.10f, mse_dropped:%.10f, mae:%.10f, mae_dropped:%.10f, re:%.10f, re_dropped:%.10f" % errs)
    return
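calculate_errors above comes from elsewhere in the project and is not shown in this excerpt. A minimal sketch of the kind of comparison it could perform, assuming both Gram matrices are square numpy arrays and dropped_elements holds (i, j) index pairs (the helper name below is hypothetical):

import numpy as np

def calculate_errors_sketch(gram_true, gram_completed, dropped_elements):
    # Illustrative re-implementation only; the project's calculate_errors may differ.
    gram_true = np.asarray(gram_true, dtype=float)
    gram_completed = np.asarray(gram_completed, dtype=float)
    diff = gram_completed - gram_true

    # Metrics over the whole matrix.
    mse = np.mean(diff ** 2)
    mae = np.mean(np.abs(diff))
    re = np.mean(np.abs(diff) / np.abs(gram_true))

    # The same metrics restricted to the re-estimated (dropped) entries.
    rows, cols = zip(*dropped_elements)
    diff_dropped = diff[rows, cols]
    true_dropped = gram_true[rows, cols]
    mse_dropped = np.mean(diff_dropped ** 2)
    mae_dropped = np.mean(np.abs(diff_dropped))
    re_dropped = np.mean(np.abs(diff_dropped) / np.abs(true_dropped))

    return mse, mse_dropped, mae, mae_dropped, re, re_dropped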
Example #2
 def test_save_and_load_hdf5(self):
     filename = "test.hdf5"
     fu.save_hdf5(filename, self.dic)
     dic = fu.load_hdf5(filename)
     print(dic)
     print(self.dic)
     # Recursively compare the reloaded dict with the original.
     self.__test_save_and_load_hdf5_rec(dic, self.dic)
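fu.save_hdf5 and fu.load_hdf5 (fu is presumably the project's file_utils module) are not shown here. A self-contained sketch of round-tripping a nested dict through h5py, which is roughly what this test exercises (names and exact behavior are assumptions):

import h5py
import numpy as np

def save_hdf5_sketch(filename, dic):
    # Hypothetical helper: write a (possibly nested) dict of arrays/scalars to HDF5.
    def write_group(group, d):
        for key, value in d.items():
            if isinstance(value, dict):
                write_group(group.create_group(key), value)
            else:
                group.create_dataset(key, data=np.asarray(value))
    with h5py.File(filename, "w") as f:
        write_group(f, dic)

def load_hdf5_sketch(filename):
    # Hypothetical helper: read the same structure back into nested dicts.
    def read_group(group):
        return {key: read_group(item) if isinstance(item, h5py.Group) else item[()]
                for key, item in group.items()}
    with h5py.File(filename, "r") as f:
        return read_group(f)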
Example #3
def main():
    """Read .pkl file, parse its metadata, plot Gram matrix and save as pdf file with matplotlib.
    """
    filename = os.path.abspath(sys.argv[1])
    title = sys.argv[2] if len(sys.argv) > 2 else ""

    if filename.endswith(".pkl"):
        dat = file_utils.load_pickle(filename)
        filename_pdf_ = filename.replace(".pkl", ".pdf")
    elif filename.endswith(".hdf5"):
        dat = file_utils.load_hdf5(filename)
        filename_pdf_ = filename.replace(".hdf5", ".pdf")
    else:
        raise ValueError("unsupported file type (expected .pkl or .hdf5): " + filename)
    dataset_type = dat['dataset_type']
    gram_matrices = dat['gram_matrices']
    sample_names = dat['sample_names']

    labels, separators, dataset_name, rotate = get_informations(
        dataset_type, sample_names)

    matrices = gram_matrices[-1]
    for key in matrices.keys():
        filename_pdf = filename_pdf_.replace(".pdf", "_" + key + ".pdf")
        plot_title = title + " " + key.replace("_", " ")
        plot_gram_to_pdf(filename_pdf,
                         matrices[key],
                         sample_names,
                         separators,
                         labels,
                         dataset_name,
                         title=plot_title,
                         rotate_vertically=rotate)
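plot_gram_to_pdf and get_informations are project functions that are not part of this excerpt. A simplified matplotlib sketch of saving a Gram matrix heatmap as a PDF (hypothetical stand-in with a reduced signature; separators and per-class labels omitted):

import matplotlib
matplotlib.use("Agg")  # render without a display
import matplotlib.pyplot as plt

def plot_gram_to_pdf_sketch(filename_pdf, gram, title=""):
    # Heatmap of the Gram matrix, written straight to a PDF file.
    fig, ax = plt.subplots(figsize=(8, 8))
    im = ax.imshow(gram, cmap="viridis")
    fig.colorbar(im, ax=ax)
    ax.set_title(title)
    fig.savefig(filename_pdf, format="pdf", bbox_inches="tight")
    plt.close(fig)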
Example #4
 def setUp(self):
     pickle_or_hdf5_location = "results/6DMG/30/t1/gram_upperChar_sigma30_triangularNone_t1_noaugmentation.hdf5"
     dataset_location = "/Users/ngym/Lorincz-Lab/project/fast_time-series_data_classification/dataset/6DMG_mat_112712/matR_char"
     
     loaded_data = file_utils.load_hdf5(os.path.abspath(pickle_or_hdf5_location))
     gram_matrices = loaded_data['gram_matrices']
     self.gram = gram_matrices[0]['original']
     self.sample_names = loaded_data['sample_names']
     self.lmbd = 0.5
     
     dataset_type = loaded_data['dataset_type']
     sample_names = [s.split('/')[-1].split('.')[0] for s in loaded_data['sample_names']]
     seqs, key_to_str, _ = read_sequences(dataset_type, direc=dataset_location)
     seqs = filter_samples(seqs, sample_names)
     key_to_str = filter_samples(key_to_str, self.sample_names)
     labels = list(key_to_str.values())
     counter = Counter(labels)
     # size_groups would normally follow from the label counts, e.g.
     # [counter[label] for label in sorted(set(labels), key=labels.index)],
     # but is hard-coded here: 26 character classes with 15 samples each.
     self.size_groups = [15] * 26
Example #5
def main():
    """Read .mat file and plot Gram matrix to html with plotly.
    """
    filename = sys.argv[1]
    if filename.endswith(".pkl"):
        dat = file_utils.load_pickle(filename)
        filename_html_ = filename.replace(".pkl", ".html")
    elif filename.endswith(".hdf5"):
        dat = file_utils.load_hdf5(filename)
        filename_html_ = filename.replace(".hdf5", ".html")
    else:
        raise ValueError("unsupported file type (expected .pkl or .hdf5): " + filename)
    dataset_type = dat['dataset_type']
    gram_matrices = dat['gram_matrices']
    sample_names = dat['sample_names']

    matrices = gram_matrices[-1]
    for key in matrices.keys():
        filename_html = filename_html_.replace(".html", "_" + key + ".html")
        plot_gram_to_html(filename_html, matrices[key], sample_names)
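As with the PDF variant, plot_gram_to_html is defined elsewhere. A minimal plotly sketch of an interactive Gram matrix heatmap (hypothetical stand-in):

import plotly.graph_objects as go

def plot_gram_to_html_sketch(filename_html, gram, sample_names):
    # Interactive heatmap with sample names on both axes, written to HTML.
    fig = go.Figure(data=go.Heatmap(z=gram, x=sample_names, y=sample_names))
    fig.write_html(filename_html)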
Example #6
def run(pickle_or_hdf5_location, dataset_location, fold_count, fold_to_drop,
        algorithm, params, output_dir, output_filename_format, output_file):
    ########
    # Create output directory and backup the configuration file to the directory
    ########
    os.makedirs(output_dir, exist_ok=True)
    try:
        shutil.copy(os.path.abspath(sys.argv[2]),
                    os.path.join(output_dir, os.path.basename(sys.argv[2])))
    except shutil.SameFileError:
        pass
    hdf5 = pickle_or_hdf5_location[-4:] == "hdf5"
    check_fold(fold_count, fold_to_drop, hdf5)
    check_algorithm(algorithm)
    check_params(algorithm, params)

    pickle_or_hdf5_location = os.path.abspath(pickle_or_hdf5_location)
    dataset_location = os.path.abspath(dataset_location)
    output_dir = os.path.abspath(output_dir)
    assert os.path.isdir(output_dir)
    assert os.path.exists(pickle_or_hdf5_location)

    ########
    # Load complete GRAM matrix
    ########
    time_main_start = os.times()

    if hdf5:
        loaded_data = file_utils.load_hdf5(pickle_or_hdf5_location)
    else:
        loaded_data = file_utils.load_pickle(pickle_or_hdf5_location)
        check_pickle_format(loaded_data)

    dataset_type = loaded_data['dataset_type']
    if dataset_type == 'UCIauslan':
        loaded_sample_names = loaded_data['sample_names']
    else:
        loaded_sample_names = [
            s.split('/')[-1].split('.')[0] for s in loaded_data['sample_names']
        ]
    gram_matrices = loaded_data['gram_matrices']
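    # If only the original Gram matrix is present, start from it; otherwise
    # resume from the most recently completed (PSD-projected) matrix.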
    if len(gram_matrices) == 1:
        gram = gram_matrices[0]['original']
    else:
        gram = gram_matrices[-1]['completed_npsd']

    # drop elements
    if fold_count == 0:
        gram_drop = gram
        indices_to_drop = []
        dropped_elements = []
    else:
        folds = k_fold_cross_validation.get_kfolds(dataset_type,
                                                   loaded_sample_names,
                                                   fold_count)
        indices_to_drop = folds[fold_to_drop - 1]
        gram_drop, dropped_elements = make_matrix_incomplete.gram_drop_samples(
            gram, indices_to_drop)

    ########
    # Prepare time-series data
    ########
    seqs, sample_names, labels_str, _ = read_sequences(dataset_type,
                                                       dataset_location)

    seqs = filter_samples(seqs, sample_names, loaded_sample_names)
    labels_str = filter_samples(labels_str, sample_names, loaded_sample_names)

    ########
    # Execute Matrix Completion
    ########
    train_start = None
    train_end = None
    if algorithm == "gak":
        ########
        # Baseline GAK
        ########
        gram_completed, time_completion_start, time_completion_end \
            = matrix_completion.gak_matrix_completion(
                gram_drop, seqs, indices_to_drop,
                sigma=params['sigma'], triangular=params['triangular'])
        action = "GAK sigma: " + str(params['sigma']) + " triangular: " + str(
            params['triangular'])
        output_filename_format = output_filename_format.replace(
            "${sigma}",
            str(params['sigma'])).replace("${triangular}",
                                          str(params['triangular']))
    elif algorithm in {"softimpute", "knn", "iterativesvd"}:
        ########
        # Baseline SoftImpute, KNN, IterativeSVD
        ########
        if algorithm == "softimpute":
            func = matrix_completion.softimpute_matrix_completion
            action = "Softimpute"
            print('running SoftImpute')
        elif algorithm == "knn":
            func = matrix_completion.knn_matrix_completion
            action = "KNN"
            print('running KNN')
        elif algorithm == "iterativesvd":
            func = matrix_completion.iterativesvd_matrix_completion
            action = "IterativeSVD"
            print('running IterativeSVD')
        else:
            print("unsupported fancyimpute algorithm")
            exit(-1)
        flag_test = np.zeros(len(seqs))
        flag_test[indices_to_drop] = 1
        drop_flag_matrix = create_true_GAK_flag_matrix(1 - params['gak_rate'],
                                                       flag_test)
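        # Presumably drop_flag_matrix marks the entries to be computed with the
        # exact GAK kernel; the loop below additionally keeps the diagonal and
        # every pair of retained (non-dropped) sequences exact.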
        for i in range(len(seqs)):
            drop_flag_matrix[i, i] = 1
            for j in range(i + 1):
                if i not in indices_to_drop and j not in indices_to_drop:
                    drop_flag_matrix[i, j] = 1
                    drop_flag_matrix[j, i] = 1

        print(len(seqs)**2)
        print(np.count_nonzero(drop_flag_matrix))
        gram_completed, time_completion_start, time_completion_end \
            = func(gram_drop,
                   seqs,
                   sigma=params['sigma'],
                   triangular=params['triangular'],
                   num_process=params['num_process'],
                   drop_flag_matrix=drop_flag_matrix)
    elif algorithm == "rnn":
        ########
        # Our Scheme, Siamese Recurrent Neural Network
        ########
        modelfile_hdf5 = os.path.join(output_dir,
                                      output_filename_format + "_model.hdf5")
        logfile_loss = os.path.join(output_dir,
                                    output_filename_format + ".losses")
        gram_completed, time_train_start, time_train_end, \
            time_completion_start, time_completion_end \
            = matrix_completion.rnn_matrix_completion(
                gram_drop,
                seqs,
                params['epochs'],
                params['patience'],
                params['epoch_start_from'],
                logfile_loss,
                modelfile_hdf5,
                params['rnn'],
                params['rnn_units'],
                params['dense_units'],
                params['dropout'],
                params['implementation'],
                params['bidirectional'],
                params['batchnormalization'],
                params['mode'],
                params['loss_function'],
                params['loss_weight_ratio'],
                labels_str,
                params['siamese_joint_method'],
                params['siamese_arms_activation'],
                trained_modelfile_hdf5=params['trained_modelfile_hdf5'])
        action = "SiameseRNN"
    elif algorithm == "fast_rnn":
        ########
        # Our Scheme, Fast Siamese Recurrent Neural Network
        ########
        modelfile_hdf5 = os.path.join(output_dir,
                                      output_filename_format + "_model.hdf5")
        logfile_loss = os.path.join(output_dir,
                                    output_filename_format + ".losses")
        gram_completed, time_completion_start, time_completion_end \
            = matrix_completion.fast_rnn_matrix_completion(
                gram_drop,
                seqs,
                params['rnn'],
                params['rnn_units'],
                params['dense_units'],
                params['dropout'],
                params['implementation'],
                params['bidirectional'],
                params['batchnormalization'],
                params['loss_function'],
                params['siamese_arms_activation'],
                params['siamese_joint_method'],
                trained_modelfile_hdf5=params['trained_modelfile_hdf5'])
        action = "FastSiameseRNN"
    else:
        raise ValueError("unsupported algorithm: " + algorithm)

    ########
    # Make the completed matrix positive semidefinite, if it is not.
    ########

    # eigenvalue check
    time_npsd_start = os.times()
    gram_completed_npsd = nearest_positive_semidefinite.nearest_positive_semidefinite(
        gram_completed)
    time_npsd_end = os.times()

    ########
    # Save results
    ########
    if hdf5:
        log_file = os.path.join(output_dir, output_filename_format + ".hdf5")
    else:
        log_file = os.path.join(output_dir, output_filename_format + ".pkl")
    action += " " + time.asctime(time.localtime())
    file_utils.append_and_save_result(log_file,
                                      loaded_data,
                                      gram_drop,
                                      gram_completed,
                                      gram_completed_npsd,
                                      indices_to_drop,
                                      action,
                                      hdf5=hdf5)

    # calculate errors
    mse, mse_dropped, mae, mae_dropped, \
        relative, relative_dropped = calculate_errors(gram, gram_completed_npsd, dropped_elements)

    time_main_end = os.times()

    # save run times and errors
    num_calculated_elements = len(dropped_elements) - len(indices_to_drop) // 2
    num_dropped_sequences = len(indices_to_drop)
    out_path = os.path.join(output_dir, output_file)
    file_utils.save_analysis(out_path, len(dropped_elements),
                             num_dropped_sequences, num_calculated_elements,
                             time_completion_start, time_completion_end,
                             time_npsd_start, time_npsd_end, time_main_start,
                             time_main_end, mse, mse_dropped, mae, mae_dropped,
                             relative, relative_dropped)
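make_matrix_incomplete.gram_drop_samples, used in the "drop elements" step above, is not shown in this excerpt. A sketch of what dropping samples from a Gram matrix could look like, assuming dropped entries are blanked with NaN and their (i, j) positions are recorded for the later error calculation (hypothetical implementation):

import numpy as np

def gram_drop_samples_sketch(gram, indices_to_drop):
    # Blank out every entry involving a dropped sample and record which
    # positions now have to be re-estimated by the completion algorithm.
    gram_drop = np.array(gram, dtype=float)
    dropped_elements = []
    n = gram_drop.shape[0]
    for i in indices_to_drop:
        for j in range(n):
            if np.isnan(gram_drop[i, j]):
                continue  # already dropped via an earlier index
            gram_drop[i, j] = np.nan
            gram_drop[j, i] = np.nan
            dropped_elements.append((i, j))
    return gram_drop, dropped_elements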
Example #7
def run(pickle_or_hdf5_location, dataset_location, fold_to_test, fold_to_tv,
        fold_count, params,
        output_dir, output_filename_format, data_augmentation_size):
    os.makedirs(output_dir, exist_ok=True)
    try:
        shutil.copy(os.path.abspath(sys.argv[2]),
                    os.path.join(output_dir, os.path.basename(sys.argv[2])))
    except shutil.SameFileError:
        pass
    hdf5 = pickle_or_hdf5_location[-4:] == "hdf5"
    if hdf5:
        loaded_data = file_utils.load_hdf5(os.path.abspath(pickle_or_hdf5_location))
    else:
        loaded_data = file_utils.load_pickle(os.path.abspath(pickle_or_hdf5_location))

    dataset_type = loaded_data['dataset_type']
    sample_names = loaded_data['sample_names']

    gram_matrices = loaded_data['gram_matrices']
    gram = gram_matrices[0]['original']
    
    folds = k_fold_cross_validation.get_kfolds(dataset_type, sample_names, fold_count)
    folds = np.array(folds)
    test_indices = np.concatenate(folds[fold_to_test])
    tv_indices = np.concatenate(folds[fold_to_tv])
    fold_for_gram = np.delete(np.arange(fold_count), fold_to_test + fold_to_tv)
    gram_indices = np.concatenate(folds[fold_for_gram]).astype(int)
    
    seqs, key_to_str, _ = read_sequences(dataset_type, dataset_location)
    augmentation_magnification = 1.2
    seqs, key_to_str, flag_augmented = augment_data(seqs, key_to_str,
                                                    augmentation_magnification,
                                                    rand_uniform=True,
                                                    num_normaldist_ave=data_augmentation_size - 2)

    
    seqs = filter_samples(seqs, sample_names)
    key_to_str = filter_samples(key_to_str, sample_names)

    logfile_hdf5 = os.path.join(output_dir, output_filename_format + "_model.hdf5")
    logfile_loss = os.path.join(output_dir, output_filename_format + ".losses")
    output_file  = os.path.join(output_dir, output_filename_format + ".json")
    
    (roc_auc_score, f1_score) = KSS_unsupervised_alpha_prediction.get_classification_error(
        gram,
        gram_indices,
        tv_indices,
        test_indices,
        list(seqs.values()),
        params['epochs'],
        params['patience'],
        logfile_hdf5,
        logfile_loss,
        params['rnn'],
        params['rnn_units'],
        params['dense_units'],
        params['dropout'],
        params['implementation'],
        params['bidirectional'],
        params['batchnormalization'],
        params['mode'],
        list(key_to_str.values()),
        params['lmbd'],
        params['top_activation'])

    print(pickle_or_hdf5_location + " roc_auc_score: " + str(roc_auc_score) + " f1_score: " + str(f1_score))
    dic = dict(roc_auc_score=roc_auc_score,
               f1_score=f1_score)
    
    file_utils.save_json(output_file, dic)
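A toy illustration of the fold bookkeeping above, with assumed shapes (5 folds of 4 samples each, testing on fold 0, training/validating on fold 1, and the remaining folds treated as the known part of the Gram matrix):

import numpy as np

folds = np.array([np.arange(i * 4, (i + 1) * 4) for i in range(5)])
fold_to_test, fold_to_tv, fold_count = [0], [1], 5

test_indices = np.concatenate(folds[fold_to_test])
tv_indices = np.concatenate(folds[fold_to_tv])
fold_for_gram = np.delete(np.arange(fold_count), fold_to_test + fold_to_tv)
gram_indices = np.concatenate(folds[fold_for_gram]).astype(int)

print(test_indices)   # [0 1 2 3]
print(tv_indices)     # [4 5 6 7]
print(gram_indices)   # [ 8  9 10 11 12 13 14 15 16 17 18 19]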
Example #8
def run(pickle_or_hdf5_location, dataset_location, fold_count, fold_to_drop,
        params, output_dir, output_filename_format, output_file,
        data_augmentation_size):
    os.makedirs(output_dir, exist_ok=True)
    try:
        shutil.copy(os.path.abspath(sys.argv[2]),
                    os.path.join(output_dir, os.path.basename(sys.argv[2])))
    except shutil.SameFileError:
        pass
    hdf5 = pickle_or_hdf5_location[-4:] == "hdf5"
    check_fold(fold_count, fold_to_drop, hdf5)

    pickle_or_hdf5_location = os.path.abspath(pickle_or_hdf5_location)
    dataset_location = os.path.abspath(dataset_location)
    output_dir = os.path.abspath(output_dir)
    assert os.path.isdir(output_dir)
    assert os.path.exists(pickle_or_hdf5_location)

    main_start = os.times()

    if hdf5:
        loaded_data = file_utils.load_hdf5(pickle_or_hdf5_location)
    else:
        loaded_data = file_utils.load_pickle(pickle_or_hdf5_location)

    dataset_type = loaded_data['dataset_type']
    if dataset_type == 'UCIauslan':
        loaded_sample_names = loaded_data['sample_names']
    else:
        loaded_sample_names = [
            s.split('/')[-1].split('.')[0] for s in loaded_data['sample_names']
        ]
    gram_matrices = loaded_data['gram_matrices']
    if len(gram_matrices) == 1:
        gram = gram_matrices[0]['original']
    else:
        gram = gram_matrices[-1]['completed_npsd']

    # drop elements
    if fold_count == 0:
        gram_drop = gram
        indices_to_drop = []
        dropped_elements = []
    else:
        folds = k_fold_cross_validation.get_kfolds(dataset_type,
                                                   loaded_sample_names,
                                                   fold_count)
        indices_to_drop = folds[fold_to_drop - 1]
        gram_drop, dropped_elements = make_matrix_incomplete.gram_drop_samples(
            gram, indices_to_drop)

    seqs, sample_names, labels_str, _ = read_sequences(dataset_type,
                                                       dataset_location)

    seqs = filter_samples(seqs, sample_names, loaded_sample_names)
    labels_str = filter_samples(labels_str, sample_names, loaded_sample_names)

    train_start = None
    train_end = None

    modelfile_hdf5 = os.path.join(output_dir,
                                  output_filename_format + "_model.hdf5")
    logfile_loss = os.path.join(output_dir, output_filename_format + ".losses")

    # pre-processing
    num_seqs = len(seqs)
    time_dim = max([seq.shape[0] for seq in seqs])
    pad_value = -4444
    seqs = pad_sequences([seq.tolist() for seq in seqs],
                         maxlen=time_dim,
                         dtype='float32',
                         padding='post',
                         value=pad_value)
    feat_dim = seqs[0].shape[1]
    input_shape = (time_dim, feat_dim)

    K.clear_session()

    # build network
    model = siamese_rnn_branch.SiameseRnnBranch(
        input_shape,
        pad_value,
        params['rnn_units'],
        params['dense_units'],
        params['rnn'],
        params['dropout'],
        params['implementation'],
        params['bidirectional'],
        params['batchnormalization'],
        params['loss_function'],
        params['siamese_joint_method'],
        params['trained_modelfile_hdf5'],
        siamese_arms_activation=params['siamese_arms_activation'])

    test_indices = indices_to_drop
    train_validation_indices = np.delete(np.arange(len(seqs)), test_indices)

    train_validation_seqs = seqs[train_validation_indices]
    test_seqs = seqs[test_indices]

    train_validation_features = model.predict(train_validation_seqs)

    time_pred_start = os.times()
    test_features = model.predict(test_seqs)
    time_pred_end = os.times()

    labels = np.array(labels_str)
    train_validation_labels = labels[train_validation_indices]
    test_labels = labels[test_indices]


    auc, f1, time_classification_start, time_classification_end = \
        linear_svm.compute_classification_errors(train_validation_features,
                                                 train_validation_labels,
                                                 test_features,
                                                 test_labels)

    main_end = os.times()

    num_calculated_sequences = len(test_seqs)

    virtual_prediction_duration = (time_pred_end.user - time_pred_start.user
                                   + time_pred_end.system - time_pred_start.system)
    elapsed_prediction_duration = time_pred_end.elapsed - time_pred_start.elapsed

    virtual_classification_duration = (time_classification_end.user - time_classification_start.user
                                       + time_classification_end.system
                                       - time_classification_start.system)
    elapsed_classification_duration = time_classification_end.elapsed - time_classification_start.elapsed

    prediction = {
        'basics': {
            'number_of_calculated_sequences': len(test_seqs),
        },
        'all': {
            'virtual_prediction_duration': virtual_prediction_duration,
            'elapsed_prediction_duration': elapsed_prediction_duration,
        },
        'each_seq': {
            'virtual_prediction_duration_per_calculated_sequence':
                virtual_prediction_duration / num_calculated_sequences,
            'elapsed_prediction_duration_per_calculated_sequence':
                elapsed_prediction_duration / num_calculated_sequences,
        },
    }

    classification = {
        'basics': {
            'roc_auc': auc,
            'f1': f1,
        },
        'all': {
            'virtual_classification_duration': virtual_classification_duration,
            'elapsed_classification_duration': elapsed_classification_duration,
        },
        'each_seq': {
            'virtual_classification_duration_per_calculated_sequence':
                virtual_classification_duration / num_calculated_sequences,
            'elapsed_classification_duration_per_calculated_sequence':
                elapsed_classification_duration / num_calculated_sequences,
        },
    }

    dic = dict(prediction=prediction, classification=classification)

    # save results as JSON
    lsvm_out_path = os.path.join(output_dir, output_file)
    file_utils.save_json(lsvm_out_path, dic)
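The timing bookkeeping above turns pairs of os.times() snapshots into a CPU ("virtual") duration and a wall-clock ("elapsed") duration. A small self-contained illustration of that arithmetic:

import os

def durations(times_start, times_end):
    # os.times() reports user/system CPU time and wall-clock 'elapsed' time.
    virtual = (times_end.user - times_start.user
               + times_end.system - times_start.system)
    elapsed = times_end.elapsed - times_start.elapsed
    return virtual, elapsed

t0 = os.times()
sum(i * i for i in range(10 ** 6))  # some work to time
t1 = os.times()
virtual_duration, elapsed_duration = durations(t0, t1)
print("virtual: %.4f s, elapsed: %.4f s" % (virtual_duration, elapsed_duration))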