def modelopt(nhmmstates, nhmmiter):

    nhmmstates = nhmmstates[0]
    nhmmiter = nhmmiter[0] * 10
    nfolds = 5
    hmmcovtype = 'full'

    print nhmmstates, nhmmiter

    # Load the dataset
    #static_train = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/train_data.npy')
    dynamic_train = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_data.npy')
    #static_test = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/test_data.npy')
    #dynamic_test = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_data.npy')
    labels_train = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_labels.npy')
    #labels_test = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_labels.npy')
    nsamples = dynamic_train.shape[0]

    # k-fold cross validation to obtain accuracy
    val_idx_list = np.array_split(range(nsamples), nfolds)
    scores = []
    for fid, val_idx in enumerate(val_idx_list):
        train_idx = list(set(range(nsamples)) - set(val_idx))
        print "Current fold is %d / %d" % (fid + 1, nfolds)

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                           dynamic_train[train_idx], labels_train[train_idx])
        scores.append(hmmcl.test(model_pos, model_neg, dynamic_train[val_idx], labels_train[val_idx]))
    
    print 'Result: %.4f' % np.mean(scores)
    return -np.mean(scores)
def function(nhmmstates, nestimators, nhmmiter):

    nhmmstates = nhmmstates[0]
    nestimators = nestimators[0] * 100
    nhmmiter = nhmmiter[0] * 10
    nfolds = 5
    hmmcovtype = "full"

    print nhmmstates, nestimators, nhmmiter

    # Load the dataset
    static_train = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/train_data.npy")
    dynamic_train = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_data.npy")
    static_val = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/test_data.npy")
    dynamic_val = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_data.npy")
    labels_train = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_labels.npy")
    labels_val = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_labels.npy")

    # Merge train and test
    static_all = np.concatenate((static_train, static_val), axis=0)
    dynamic_all = np.concatenate((dynamic_train, dynamic_val), axis=0)
    labels_all = np.concatenate((labels_train, labels_val), axis=0)
    nsamples = static_all.shape[0]

    # prepare where to store the ratios
    ratios_all_hmm = np.empty(len(labels_all))

    # split indices into folds
    enrich_idx_list = np.array_split(range(nsamples), nfolds)

    # run CV
    for fid, enrich_idx in enumerate(enrich_idx_list):
        train_idx = list(set(range(nsamples)) - set(enrich_idx))

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(
            nhmmstates, nhmmiter, hmmcovtype, dynamic_all[train_idx], labels_all[train_idx]
        )
        ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[enrich_idx])

    # dataset for hybrid learning
    enriched_by_hmm = np.concatenate((static_all, np.matrix(ratios_all_hmm).T), axis=1)

    # (2.) k-fold cross validation to obtain accuracy
    val_idx_list = np.array_split(range(nsamples), nfolds)
    scores = []
    for fid, val_idx in enumerate(val_idx_list):
        train_idx = list(set(range(nsamples)) - set(val_idx))

        # Hybrid on features enriched by HMM (3)
        rf = RandomForestClassifier(n_estimators=nestimators)
        rf.fit(enriched_by_hmm[train_idx], labels_all[train_idx])
        scores.append(rf.score(enriched_by_hmm[val_idx], labels_all[val_idx]))

    print "Result: %.4f" % np.mean(scores)
    return -np.mean(scores)
def bpic(nhmmstates, nhmmiter):

    nhmmstates = nhmmstates[0]
    nhmmiter = nhmmiter[0] * 10
    nfolds = 5
    hmmcovtype = 'full'

    print nhmmstates, nhmmiter

    # Load the dataset
    static_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_static.npy'
    )
    dynamic_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_dynamic.npy'
    )
    static_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_static.npy'
    )
    dynamic_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_dynamic.npy'
    )
    labels_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_labels.npy'
    )
    labels_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_labels.npy'
    )

    # Merge train and test
    static_all = np.concatenate((static_train, static_val), axis=0)
    dynamic_all = np.concatenate((dynamic_train, dynamic_val), axis=0)
    labels_all = np.concatenate((labels_train, labels_val), axis=0)
    nsamples = static_all.shape[0]

    # k-fold cross validation to obtain accuracy
    val_idx_list = np.array_split(range(nsamples), nfolds)
    scores = []
    for fid, val_idx in enumerate(val_idx_list):
        train_idx = list(set(range(nsamples)) - set(val_idx))

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                           dynamic_all[train_idx],
                                           labels_all[train_idx])
        scores.append(
            hmmcl.test(model_pos, model_neg, dynamic_all[val_idx],
                       labels_all[val_idx]))

    print 'Result: %.4f' % np.mean(scores)
    return -np.mean(scores)
def modelopt(nhmmstates, nhmmiter):

    nhmmstates = nhmmstates[0]
    nhmmiter = nhmmiter[0] * 10
    nfolds = 5
    hmmcovtype = 'full'

    print nhmmstates, nhmmiter

    # Load the dataset
    #static_train = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/train_data.npy')
    dynamic_train = np.load(
        '/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_data.npy')
    #static_test = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/test_data.npy')
    #dynamic_test = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_data.npy')
    labels_train = np.load(
        '/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_labels.npy')
    #labels_test = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_labels.npy')
    nsamples = dynamic_train.shape[0]

    # k-fold cross validation to obtain accuracy
    val_idx_list = np.array_split(range(nsamples), nfolds)
    scores = []
    for fid, val_idx in enumerate(val_idx_list):
        train_idx = list(set(range(nsamples)) - set(val_idx))
        print "Current fold is %d / %d" % (fid + 1, nfolds)

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                           dynamic_train[train_idx],
                                           labels_train[train_idx])
        scores.append(
            hmmcl.test(model_pos, model_neg, dynamic_train[val_idx],
                       labels_train[val_idx]))

    print 'Result: %.4f' % np.mean(scores)
    return -np.mean(scores)
def bpic(nhmmstates, nhmmiter):

    nhmmstates = nhmmstates[0]
    nhmmiter = nhmmiter[0] * 10
    nfolds = 5
    hmmcovtype = 'full'

    print nhmmstates, nhmmiter

    # Load the dataset
    static_train = np.load('/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_static.npy')
    dynamic_train = np.load('/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_dynamic.npy')
    static_val = np.load('/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_static.npy')
    dynamic_val = np.load('/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_dynamic.npy')
    labels_train = np.load('/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_labels.npy')
    labels_val = np.load('/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_labels.npy')
    
    # Merge train and test
    static_all = np.concatenate((static_train, static_val), axis=0)
    dynamic_all = np.concatenate((dynamic_train, dynamic_val), axis=0)
    labels_all = np.concatenate((labels_train, labels_val), axis=0)
    nsamples = static_all.shape[0]

    # k-fold cross validation to obtain accuracy
    val_idx_list = np.array_split(range(nsamples), nfolds)
    scores = []
    for fid, val_idx in enumerate(val_idx_list):
        train_idx = list(set(range(nsamples)) - set(val_idx))

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                           dynamic_all[train_idx], labels_all[train_idx])
        scores.append(hmmcl.test(model_pos, model_neg, dynamic_all[val_idx], labels_all[val_idx]))
    
    print 'Result: %.4f' % np.mean(scores)
    return -np.mean(scores)
"""

Generate HMM-based classifier on syn_lstm_wins synthetic dataset

"""

import numpy as np
from HMM.hmm_classifier import HMMClassifier

print 'Loading the dataset..'
train_data = np.load("/storage/hpc_anna/GMiC/Data/syn_lstm_wins/train_dynamic.npy")
train_labels = np.load("/storage/hpc_anna/GMiC/Data/syn_lstm_wins/train_labels.npy")
test_data = np.load("/storage/hpc_anna/GMiC/Data/syn_lstm_wins/test_dynamic.npy")
test_labels = np.load("/storage/hpc_anna/GMiC/Data/syn_lstm_wins/test_labels.npy")

print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(2, 2, 'full', train_data, train_labels)
print hmmcl.test(model_pos, model_neg, test_data, test_labels)





# load the data
print "Reading data..."
train_data = np.load("/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/train_data.npy")
train_labels = np.load("/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/train_labels.npy")
test_data = np.load("/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/test_data.npy")
test_labels = np.load("/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/test_labels.npy")

# split the training data into two halves
#   fh stands for first half
#   sh stands for second half
print "Splitting data in two halves..."
fh_data, fh_labels, sh_data, sh_labels = DataHandler.split(0.5, train_data, train_labels)

# train HMM on first 50% of the training set
print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(NSTATES, NITERS, fh_data, fh_labels)

# feed second 50% of the training set into the HMM to obtain
# pos/neg ratio for every sequence in the second half of the training set
print "Extracting ratios based on the HMM model..."
sh_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, sh_data)
test_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, test_data)

# apply fourier transform on the second 50% of the training set
print "Fouriering the second half of the dataset..."
fourier_sh_data = Fourier.data_to_fourier(sh_data)
fourier_test_data = Fourier.data_to_fourier(test_data)

# augment fourier results of the second 50% train with the ratios thus producing an enriched dataset
print "Merging Fourier features and HMM-based ratios..."
Пример #8
0
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:58:39 2015

@author: annaleontjeva
"""
from HMM.hmm_classifier import HMMClassifier
from DataNexus.datahandler import DataHandler
import numpy as np

train_data = np.load(
    "/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/train_data.npy")
train_labels = np.load(
    "/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/train_labels.npy")
test_data = np.load(
    "/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/test_data.npy")
test_labels = np.load(
    "/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/test_labels.npy")

hmmcl = HMMClassifier()
#model_pos, model_neg = hmmcl.train(20, 100, train_data, train_labels)
#print hmmcl.test_model(model_pos, model_neg, test_data, test_labels)
hmmcl.find_best_parameter(0.7, range(20, 31), 10, 5, train_data, train_labels)
static_val = np.load(
    '/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/test_data.npy')
dynamic_val = np.load(
    '/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_data.npy')
labels_train = np.load(
    '/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_labels.npy')
labels_val = np.load(
    '/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_labels.npy')
nsamples = dynamic_train.shape[0]

# split indices into folds
val_idx_list = np.array_split(range(nsamples), nfolds)

# run CV
scores = []
for fid, val_idx in enumerate(val_idx_list):
    print "Current fold is %d/%d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(val_idx))

    # train the model and report performance
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nstates, niter, 'full',
                                       dynamic_train[train_idx],
                                       labels_train[train_idx])
    scores.append(
        hmmcl.test(model_pos, model_neg, dynamic_train[val_idx],
                   labels_train[val_idx]))

print "===> (7) HMM with dynamic features on CV: %.4f (+/- %.4f) %s" % (
    np.mean(scores), np.std(scores), scores)
Пример #10
0
labels_all = np.concatenate((labels_train, labels_val), axis=0)
nsamples = static_all.shape[0]


#
# Cross-validation to collect enrichment features
#
visits_all_hmm = np.empty((len(labels_all), nstates * 2))

predict_idx_list = np.array_split(range(nsamples), nfolds)
for fid, predict_idx in enumerate(predict_idx_list):
    print "Current fold is %d" % fid
    train_idx = list(set(range(nsamples)) - set(predict_idx))
    
    # extract visit counts from HMM on dynamic
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nstates, niter, covtype, dynamic_all[train_idx], labels_all[train_idx])
    visits_pos, visits_neg = hmmcl.state_visits(model_pos, model_neg, dynamic_all[predict_idx])
    visits_all_hmm[predict_idx] = np.hstack((visits_pos, visits_neg))

# prepare the dataset
enriched_by_hmm = np.concatenate((static_all, visits_all_hmm), axis=1)

# split the data into training and test
train_idx = np.random.choice(range(0, nsamples), size=np.round(nsamples * 0.7, 0), replace=False)
test_idx = list(set(range(0, nsamples)) - set(train_idx))

# train hybrid on features enriched by HMM (3)
rf = RandomForestClassifier(n_estimators=nestimators)
rf.fit(enriched_by_hmm[train_idx], labels_all[train_idx])
print "===> (3) Hybrid (RF) on features (state visit counts) enriched by HMM: %.4f" % rf.score(enriched_by_hmm[test_idx], labels_all[test_idx])
Пример #11
0
dynamic_all = np.concatenate((dynamic_train, dynamic_val), axis=0)
labels_all = np.concatenate((labels_train, labels_val), axis=0)
nsamples = static_all.shape[0]

# split indices into folds
val_idx_list = np.array_split(range(nsamples), nfolds)

# run CV
scores_acc = []
scores_auc = []

for fid, val_idx in enumerate(val_idx_list):
    print "Current fold is %d" % fid
    train_idx = list(set(range(nsamples)) - set(val_idx))
    # HMM on dynamic features (7)
    hmmcl = HMMClassifier()
    dnm_train = dynamic_all[train_idx]
    lbls_train = labels_all[train_idx]
    dnm_val = dynamic_all[train_idx]
    lbls_val = labels_all[train_idx] 
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, dnm_train, lbls_train)
    acc, auc = hmmcl.test(model_pos, model_neg, dnm_val, lbls_val)
    scores_acc.append(acc)
    scores_auc.append(auc)

print "===> (7) accuracy of HMM on dynamic features: %.4f (+/- %.4f) %s" % (np.mean(scores_acc), np.std(scores_acc), scores_acc)
print "===> (7)      auc of HMM on dynamic features: %.4f (+/- %.4f) %s" % (np.mean(scores_auc), np.std(scores_auc), scores_auc)



# run CV
for fid, predict_idx in enumerate(predict_idx_list):
    print "Current fold is %d / %d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(predict_idx))
    
    # extract predictions using RF on static
    print "Extracting predictions on static data with RF..."
    rf = RandomForestClassifier(n_estimators=nestimators)
    rf.fit(static_all[train_idx], labels_all[train_idx])
    predictions_all_rf[predict_idx] = rf.predict_log_proba(static_all[predict_idx])
    predictions_all_rf[predictions_all_rf == -inf] = np.min(predictions_all_rf[predictions_all_rf != -inf])

    # extract predictions using HMM on dynamic
    print "Extracting predictions on dynamic data with HMM..."
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, dynamic_all[train_idx], labels_all[train_idx])
    predictions_all_hmm[predict_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[predict_idx])
    ratios_all_hmm[predict_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])

#
# Prepare combined datasets for the future experiments
#

# datasets for ensemble learning
predictions_combined_rf_hmm = np.concatenate((predictions_all_rf, predictions_all_hmm), axis=1)

# datasets for hybrid learning
enriched_by_hmm = np.concatenate((static_all, np.matrix(ratios_all_hmm).T), axis=1)

# dataset to confirm that RF on dynamic is not better than generative models on dynamic data
#
# Train enrichment models on trainA
#
print 'Training enrichment models...'

# extract predictions using RF on static
rf = RandomForestClassifier(n_estimators=nestimators)
rf.fit(trainA_static, trainA_labels)
predictions_trainB_rf = rf.predict_log_proba(trainB_static)
predictions_trainB_rf[predictions_trainB_rf == -inf] = np.min(predictions_trainB_rf[predictions_trainB_rf != -inf])
predictions_test_rf = rf.predict_log_proba(test_static)
predictions_test_rf[predictions_test_rf == -inf] = np.min(predictions_test_rf[predictions_test_rf != -inf])

# extract predictions using HMM on dynamic
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, trainA_dynamic, trainA_labels)
predictions_trainB_hmm = hmmcl.predict_log_proba(model_pos, model_neg, trainB_dynamic)
ratios_trainB_hmm = hmmcl.pos_neg_ratios(model_pos, model_neg, trainB_dynamic)
predictions_test_hmm = hmmcl.predict_log_proba(model_pos, model_neg, test_dynamic)
ratios_test_hmm = hmmcl.pos_neg_ratios(model_pos, model_neg, test_dynamic)

# extract predictions using LSTM on dynamic
lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs, lstmbatchsize)
model_pos, model_neg = lstmcl.train(trainA_dynamic, trainA_labels)
mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, trainB_dynamic)
predictions_trainB_lstm = np.vstack((mse_pos, mse_neg)).T
ratios_trainB_lstm = lstmcl.pos_neg_ratios(model_pos, model_neg, trainB_dynamic)
mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, test_dynamic)
predictions_test_lstm = np.vstack((mse_pos, mse_neg)).T
ratios_test_lstm = lstmcl.pos_neg_ratios(model_pos, model_neg, test_dynamic)
# prepare where to store the ratios
ratios_all_hmm = np.empty(len(labels_all))
predictions_all_hmm = np.empty((len(labels_all), 2))
predictions_all = np.empty((len(labels_all),))

# split indices into folds
enrich_idx_list = np.array_split(range(nsamples), nfolds)

# run CV for enrichment
for fid, enrich_idx in enumerate(enrich_idx_list):
    print "Current fold is %d/%d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(enrich_idx))

    # extract predictions using HMM on dynamic
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, dynamic_all[train_idx], labels_all[train_idx])
    ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[enrich_idx])
    predictions_all_hmm[enrich_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[enrich_idx])
    predictions_all[enrich_idx] = hmmcl.predict(hmmcl.tensor_to_list(dynamic_all[enrich_idx]), model_pos, model_neg)

# dataset for hybrid learning
dynamic_as_static = dynamic_all.reshape((dynamic_all.shape[0], dynamic_all.shape[1] * dynamic_all.shape[2]))
enriched_by_hmm = np.concatenate((dynamic_as_static, predictions_all_hmm), axis=1)


# k-fold cross validation to obtain accuracy
print "===> HMM on dynamic: %.4f" % hmmcl.accuracy(predictions_all, labels_all)

val_idx_list = np.array_split(range(nsamples), nfolds)
scores = []
labels_train = np.load("/storage/hpc_anna/GMiC/Data/syn_lstm_wins/train_labels.npy")
labels_val = np.load("/storage/hpc_anna/GMiC/Data/syn_lstm_wins/test_labels.npy")


#
# Sanity Checks
#
print "Expected performance of a lonely model is 0.75, of the joint model 1.0"

# static data with RF
rf = RandomForestClassifier(n_estimators=100)
rf.fit(static_train, labels_train)
print "Random Forest with static features on validation set: %.4f" % rf.score(static_val, labels_val)

# dynamic data with HMM
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(3, 10, dynamic_train, labels_train)
print "HMM with dynamic features on validation set: %.4f" % hmmcl.test(model_pos, model_neg, dynamic_val, labels_val)

# dynamic data with LSTM
lstmcl = LSTMClassifier(2000, 0.5, "adagrad", 20)
model_pos, model_neg = lstmcl.train(dynamic_train, labels_train)
print "LSTM with dynamic features on validation set: %.4f" % lstmcl.test(model_pos, model_neg, dynamic_val, labels_val)

# dynamic data with RF
print "Training RF on the dynamic dataset..."
dynamic_as_static_train = dynamic_train.reshape(
    (dynamic_train.shape[0], dynamic_train.shape[1] * dynamic_train.shape[2])
)
dynamic_as_static_val = dynamic_val.reshape((dynamic_val.shape[0], dynamic_val.shape[1] * dynamic_val.shape[2]))
rf = RandomForestClassifier(n_estimators=100)
labels_all = np.concatenate((labels_train, labels_val), axis=0)
nsamples = static_all.shape[0]


#
# Cross-validation to collect enrichment features
#
visits_all_hmm = np.empty((len(labels_all), nstates * 2))

predict_idx_list = np.array_split(range(nsamples), nfolds)
for fid, predict_idx in enumerate(predict_idx_list):
    print "Current fold is %d" % fid
    train_idx = list(set(range(nsamples)) - set(predict_idx))

    # extract visit counts from HMM on dynamic
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nstates, niter, covtype, dynamic_all[train_idx], labels_all[train_idx])
    visits_pos, visits_neg = hmmcl.state_visits(model_pos, model_neg, dynamic_all[predict_idx])
    visits_all_hmm[predict_idx] = np.hstack((visits_pos, visits_neg))

# prepare the dataset
enriched_by_hmm = np.concatenate((static_all, visits_all_hmm), axis=1)

# split the data into training and test
train_idx = np.random.choice(range(0, nsamples), size=np.round(nsamples * 0.7, 0), replace=False)
test_idx = list(set(range(0, nsamples)) - set(train_idx))

# train hybrid on features enriched by HMM (3)
rf = RandomForestClassifier(n_estimators=nestimators)
rf.fit(enriched_by_hmm[train_idx], labels_all[train_idx])
print "===> (3) Hybrid (RF) on features (state visit counts) enriched by HMM: %.4f" % rf.score(
labels_all = np.concatenate((labels_train, labels_val), axis=0)
nsamples = static_all.shape[0]

# prepare where to store the ratios
ratios_all_hmm = np.empty(len(labels_all))

# split indices into folds
enrich_idx_list = np.array_split(range(nsamples), nfolds)

# CV for feature enrichment
for fid, enrich_idx in enumerate(enrich_idx_list):
    print "Current fold is %d / %d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(enrich_idx))

    # extract predictions using HMM on dynamic
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                       dynamic_all[train_idx], labels_all[train_idx])
    ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[enrich_idx])

# dataset for hybrid learning
enriched_by_hmm = np.concatenate((static_all, np.matrix(ratios_all_hmm).T), axis=1)

# CV for accuracy estimation
val_idx_list = np.array_split(range(nsamples), nfolds)
scores = []
for fid, val_idx in enumerate(val_idx_list):
    print "Current fold is %d / %d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(val_idx))
    
    # Hybrid on features enriched by HMM (3)
Пример #18
0
# Train enrichment models on trainA
#
print 'Training enrichment models...'

# extract predictions using RF on static
rf = RandomForestClassifier(n_estimators=nestimators)
rf.fit(trainA_static, trainA_labels)
predictions_trainB_rf = rf.predict_log_proba(trainB_static)
predictions_trainB_rf[predictions_trainB_rf == -inf] = np.min(
    predictions_trainB_rf[predictions_trainB_rf != -inf])
predictions_test_rf = rf.predict_log_proba(test_static)
predictions_test_rf[predictions_test_rf == -inf] = np.min(
    predictions_test_rf[predictions_test_rf != -inf])

# extract predictions using HMM on dynamic
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                   trainA_dynamic, trainA_labels)
predictions_trainB_hmm = hmmcl.predict_log_proba(model_pos, model_neg,
                                                 trainB_dynamic)
ratios_trainB_hmm = hmmcl.pos_neg_ratios(model_pos, model_neg, trainB_dynamic)
predictions_test_hmm = hmmcl.predict_log_proba(model_pos, model_neg,
                                               test_dynamic)
ratios_test_hmm = hmmcl.pos_neg_ratios(model_pos, model_neg, test_dynamic)

# extract predictions using LSTM on dynamic
lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs,
                        lstmbatchsize)
model_pos, model_neg = lstmcl.train(trainA_dynamic, trainA_labels)
mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, trainB_dynamic)
predictions_trainB_lstm = np.vstack((mse_pos, mse_neg)).T
    "/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/train_labels.npy")
test_data = np.load(
    "/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/test_data.npy")
test_labels = np.load(
    "/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/test_labels.npy")

# split the training data into two halves
#   fh stands for first half
#   sh stands for second half
print "Splitting data in two halves..."
fh_data, fh_labels, sh_data, sh_labels = DataHandler.split(
    0.5, train_data, train_labels)

# train HMM on first 50% of the training set
print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(NSTATES, NITERS, fh_data, fh_labels)

# feed second 50% of the training set into the HMM to obtain
# pos/neg ratio for every sequence in the second half of the training set
print "Extracting ratios based on the HMM model..."
sh_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, sh_data)
test_ratios = hmmcl.pos_neg_ratios(model_pos, model_neg, test_data)

# apply fourier transform on the second 50% of the training set
print "Fouriering the second half of the dataset..."
fourier_sh_data = Fourier.data_to_fourier(sh_data)
fourier_test_data = Fourier.data_to_fourier(test_data)

# augment fourier results of the second 50% train with the ratios thus producing an enriched dataset
print "Merging Fourier features and HMM-based ratios..."
"""

Generate HMM-based classifier on syn_lstm_wins synthetic dataset

"""

import numpy as np
from HMM.hmm_classifier import HMMClassifier

print 'Loading the dataset..'
train_data = np.load(
    "/storage/hpc_anna/GMiC/Data/syn_lstm_wins/train_dynamic.npy")
train_labels = np.load(
    "/storage/hpc_anna/GMiC/Data/syn_lstm_wins/train_labels.npy")
test_data = np.load(
    "/storage/hpc_anna/GMiC/Data/syn_lstm_wins/test_dynamic.npy")
test_labels = np.load(
    "/storage/hpc_anna/GMiC/Data/syn_lstm_wins/test_labels.npy")

print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(2, 2, 'full', train_data, train_labels)
print hmmcl.test(model_pos, model_neg, test_data, test_labels)
# run CV
for fid, predict_idx in enumerate(predict_idx_list):
    print "Current fold is %d" % fid
    train_idx = list(set(range(nsamples)) - set(predict_idx))
    
    # extract predictions using RF on static
    print "    Extracting predictions on static data with RF..."
    rf = RandomForestClassifier(n_estimators=nestimators)
    rf.fit(static_all[train_idx], labels_all[train_idx])
    predictions_all_rf[predict_idx] = rf.predict_log_proba(static_all[predict_idx])
    predictions_all_rf[predictions_all_rf == -inf] = np.min(predictions_all_rf[predictions_all_rf != -inf])

    # extract predictions using HMM on dynamic
    print "    Extracting predictions on dynamic data with HMM..."
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                       dynamic_all[train_idx], labels_all[train_idx])
    predictions_all_hmm[predict_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[predict_idx])
    ratios_all_hmm[predict_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])

    # extract predictions using LSTM on dynamic
    print "    Extracting predictions on dynamic data with LSTM..."
    lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs, lstmbatchsize)
    model_pos, model_neg = lstmcl.train(dynamic_all[train_idx], labels_all[train_idx])
    mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, dynamic_all[predict_idx]) 
    predictions_all_lstm[predict_idx] = np.vstack((mse_pos, mse_neg)).T
    ratios_all_lstm[predict_idx] = lstmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])


#
# prepare where to store the ratios
ratios_all_hmm = np.empty(len(labels_all))
predictions_all_hmm = np.empty((len(labels_all), 2))
predictions_all = np.empty((len(labels_all), ))

# split indices into folds
enrich_idx_list = np.array_split(range(nsamples), nfolds)

# run CV for enrichment
for fid, enrich_idx in enumerate(enrich_idx_list):
    print "Current fold is %d/%d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(enrich_idx))

    # extract predictions using HMM on dynamic
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                       dynamic_all[train_idx], labels_all[train_idx])
    ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[enrich_idx])
    predictions_all_hmm[enrich_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[enrich_idx])
    predictions_all[enrich_idx] = hmmcl.predict(hmmcl.tensor_to_list(dynamic_all[enrich_idx]), model_pos, model_neg)
 
# dataset for hybrid learning
dynamic_as_static = dynamic_all.reshape((dynamic_all.shape[0], dynamic_all.shape[1] * dynamic_all.shape[2]))
enriched_by_hmm = np.concatenate((dynamic_as_static, predictions_all_hmm), axis=1)


# k-fold cross validation to obtain accuracy
print '===> HMM on dynamic: %.4f' % hmmcl.accuracy(predictions_all, labels_all)

val_idx_list = np.array_split(range(nsamples), nfolds)
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:58:39 2015

@author: annaleontjeva
"""
from HMM.hmm_classifier import HMMClassifier
from DataNexus.datahandler import DataHandler
import numpy as np

train_data = np.load("/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/train_data.npy")
train_labels = np.load("/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/train_labels.npy")
test_data = np.load("/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/test_data.npy")
test_labels = np.load("/storage/hpc_anna/GMiC/Data/ECoG/preprocessed/test_labels.npy")

hmmcl = HMMClassifier()
#model_pos, model_neg = hmmcl.train(20, 100, train_data, train_labels)
#print hmmcl.test_model(model_pos, model_neg, test_data, test_labels)
hmmcl.find_best_parameter(0.7, range(20,31), 10, 5, train_data, train_labels)
# parameters
nfolds = 5
nstates = 6
niter = 50

# load data
static_train = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/train_data.npy')
dynamic_train = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_data.npy')
static_val = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/fourier/test_data.npy')
dynamic_val = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_data.npy')
labels_train = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_labels.npy')
labels_val = np.load('/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_labels.npy')
nsamples = dynamic_train.shape[0]

# split indices into folds
val_idx_list = np.array_split(range(nsamples), nfolds)

# run CV
scores = []
for fid, val_idx in enumerate(val_idx_list):
    print "Current fold is %d/%d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(val_idx))

    # train the model and report performance
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nstates, niter, 'full', dynamic_train[train_idx], labels_train[train_idx])
    scores.append(hmmcl.test(model_pos, model_neg, dynamic_train[val_idx], labels_train[val_idx]))

print "===> (7) HMM with dynamic features on CV: %.4f (+/- %.4f) %s" % (np.mean(scores), np.std(scores), scores)
def bpic(nhmmstates, nestimators, nhmmiter):

    nhmmstates = nhmmstates[0]
    nestimators = nestimators[0] * 100
    nhmmiter = nhmmiter[0] * 10
    nfolds = 5
    hmmcovtype = 'full'

    print nhmmstates, nestimators, nhmmiter

    # Load the dataset
    static_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_static.npy'
    )
    dynamic_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_dynamic.npy'
    )
    static_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_static.npy'
    )
    dynamic_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_dynamic.npy'
    )
    labels_train = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/train_labels.npy'
    )
    labels_val = np.load(
        '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_labels.npy'
    )

    # Merge train and test
    static_all = np.concatenate((static_train, static_val), axis=0)
    dynamic_all = np.concatenate((dynamic_train, dynamic_val), axis=0)
    labels_all = np.concatenate((labels_train, labels_val), axis=0)
    nsamples = static_all.shape[0]

    # prepare where to store the ratios
    ratios_all_hmm = np.empty(len(labels_all))

    # split indices into folds
    enrich_idx_list = np.array_split(range(nsamples), nfolds)

    # run CV
    for fid, enrich_idx in enumerate(enrich_idx_list):
        train_idx = list(set(range(nsamples)) - set(enrich_idx))

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                           dynamic_all[train_idx],
                                           labels_all[train_idx])
        ratios_all_hmm[enrich_idx] = hmmcl.pos_neg_ratios(
            model_pos, model_neg, dynamic_all[enrich_idx])

    # dataset for hybrid learning
    enriched_by_hmm = np.concatenate((static_all, np.matrix(ratios_all_hmm).T),
                                     axis=1)

    # (2.) k-fold cross validation to obtain accuracy
    val_idx_list = np.array_split(range(nsamples), nfolds)
    scores = []
    for fid, val_idx in enumerate(val_idx_list):
        train_idx = list(set(range(nsamples)) - set(val_idx))

        # Hybrid on features enriched by HMM (3)
        rf = RandomForestClassifier(n_estimators=nestimators)
        rf.fit(enriched_by_hmm[train_idx], labels_all[train_idx])
        scores.append(rf.score(enriched_by_hmm[val_idx], labels_all[val_idx]))

    print 'Result: %.4f' % np.mean(scores)
    return -np.mean(scores)
Пример #26
0
)
labels_val = np.load(
    '/storage/hpc_anna/GMiC/Data/BPIChallenge/f1/preprocessed/test_labels.npy')

# sclare overly large values
#dynamic_train[:, 1, :] = dynamic_train[:, 1, :] / 1000000000.0
#dynamic_val[:, 1, :] = dynamic_val[:, 1, :] / 1000000000.0

# static data with RF
rf = RandomForestClassifier(n_estimators=rf_estimators, n_jobs=-1)
rf.fit(static_train, labels_train)
print "Random Forest with static features on validation set: %.4f" % rf.score(
    static_val, labels_val)

# dynamic data with HMM
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(3, 10, dynamic_train, labels_train)
print "HMM with dynamic features on validation set: %.4f" % hmmcl.test(
    model_pos, model_neg, dynamic_val, labels_val)

# dynamic data with RF
print "Training RF on the dynamic dataset..."
dynamic_as_static_train = dynamic_train.reshape(
    (dynamic_train.shape[0], dynamic_train.shape[1] * dynamic_train.shape[2]))
dynamic_as_static_val = dynamic_val.reshape(
    (dynamic_val.shape[0], dynamic_val.shape[1] * dynamic_val.shape[2]))
rf = RandomForestClassifier(n_estimators=rf_estimators, n_jobs=-1)
rf.fit(dynamic_as_static_train, labels_train)
print "RF with dynamic features on validation set: %.4f" % rf.score(
    dynamic_as_static_val, labels_val)
print 'Loading the dataset..'
static_train = np.load('/storage/hpc_anna/GMiC/Data/syn_multisame/train_static.npy')
dynamic_train = np.load('/storage/hpc_anna/GMiC/Data/syn_multisame/train_dynamic.npy')
static_val = np.load('/storage/hpc_anna/GMiC/Data/syn_multisame/test_static.npy')
dynamic_val = np.load('/storage/hpc_anna/GMiC/Data/syn_multisame/test_dynamic.npy')
labels_train = np.load('/storage/hpc_anna/GMiC/Data/syn_multisame/train_labels.npy')
labels_val = np.load('/storage/hpc_anna/GMiC/Data/syn_multisame/test_labels.npy')

#
# Evaluating Joint Model
#
print "Evaluating joint model:"
print "Splitting data in two halves..."
fh_idx = np.random.choice(range(0, dynamic_train.shape[0]), size=np.round(dynamic_train.shape[0] * 0.5, 0), replace=False)
sh_idx = list(set(range(0, dynamic_train.shape[0])) - set(fh_idx))
fh_data = dynamic_train[fh_idx, :, :]
fh_labels = labels_train[fh_idx]
sh_data = dynamic_train[sh_idx, :, :]
sh_labels = labels_train[sh_idx]

print "Training HMM classifier..."
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(3, 10, fh_data, fh_labels)
print model_pos.startprob, model_pos.transmat, model_pos.means, model_pos.covars
print model_neg.startprob, model_neg.transmat, model_neg.means, model_neg.covars





Пример #28
0
# run CV
for fid, predict_idx in enumerate(predict_idx_list):
    print "Current fold is %d" % fid
    train_idx = list(set(range(nsamples)) - set(predict_idx))
    
    # extract predictions using RF on static
    print "    Extracting predictions on static data with RF..."
    rf = RandomForestClassifier(n_estimators=nestimators)
    rf.fit(static_all[train_idx], labels_all[train_idx])
    predictions_all_rf[predict_idx] = rf.predict_log_proba(static_all[predict_idx])
    predictions_all_rf[predictions_all_rf == -inf] = np.min(predictions_all_rf[predictions_all_rf != -inf])

    # extract predictions using HMM on dynamic
    print "    Extracting predictions on dynamic data with HMM..."
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                       dynamic_all[train_idx], labels_all[train_idx])
    predictions_all_hmm[predict_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[predict_idx])
    ratios_all_hmm[predict_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])

    # extract predictions using LSTM on dynamic
    print "    Extracting predictions on dynamic data with LSTM..."
    lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs, lstmbatchsize)
    model_pos, model_neg = lstmcl.train(dynamic_all[train_idx], labels_all[train_idx])
    mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, dynamic_all[predict_idx]) 
    predictions_all_lstm[predict_idx] = np.vstack((mse_pos, mse_neg)).T
    ratios_all_lstm[predict_idx] = lstmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])


#
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:58:39 2015

@author: annaleontjeva
"""

from HMM.hmm_classifier import HMMClassifier
import numpy as np

# load the dataset
train_data = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_data.npy")
train_labels = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/train_labels.npy")
test_data = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_data.npy")
test_labels = np.load("/storage/hpc_anna/GMiC/Data/ECoGmixed/preprocessed/test_labels.npy")

# TEMPORARY FOR TESTING PURPOSES
#train_data = train_data[:, 0:2, :]
#test_data = test_data[:, 0:2, :]

# initialize HMM based classifier
hmmcl = HMMClassifier()

# train a model pair for each feature
models_pos, models_neg = hmmcl.train_per_feature(3, 10, train_data, train_labels)

# show accuracy on the test set
print hmmcl.test_per_feature(models_pos, models_neg, test_data, test_labels)

    # k-fold CV for enrichment

    # prepare where to store the ratios
    ratios_all_hmm = np.empty(len(labels_all))
    ratios_all_lstm = np.empty(len(labels_all))

    # split indices into folds
    predict_idx_list = np.array_split(range(nsamples), nfolds)

    # run CV
    for fid, predict_idx in enumerate(predict_idx_list):
        print "Enrichment fold %d / %d" % (fid + 1, nfolds)
        train_idx = list(set(range(nsamples)) - set(predict_idx))

        # extract predictions using HMM on dynamic
        hmmcl = HMMClassifier()
        model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                           dynamic_all[train_idx],
                                           labels_all[train_idx])
        ratios_all_hmm[predict_idx] = hmmcl.pos_neg_ratios(
            model_pos, model_neg, dynamic_all[predict_idx])

        # extract predictions using LSTM on dynamic
        lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs,
                                lstmbatchsize)
        model_pos, model_neg = lstmcl.train(dynamic_all[train_idx],
                                            labels_all[train_idx])
        mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg,
                                              dynamic_all[predict_idx])
        ratios_all_lstm[predict_idx] = lstmcl.pos_neg_ratios(
            model_pos, model_neg, dynamic_all[predict_idx])
#print dynamic_val_labels.shape


#
# Sanity Checks
#

print "Error ratio: %.2f, expected performance of a lonely model is %.2f, of the joint model %.2f" % (error_ratio, 1 - error_ratio, 1 - error_ratio + error_ratio / 2.0)

# a) static data classification
rf = RandomForestClassifier(n_estimators=100)
rf.fit(static_train_data, static_train_labels)
print "Random Forest with static features on validation set: %.4f" % rf.score(static_val_data, static_val_labels)

# b) dynamic data classification
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(3, 10, dynamic_train_data, dynamic_train_labels)
print "HMM with dynamic features on validation set: %.4f" % hmmcl.test(model_pos, model_neg, dynamic_val_data, dynamic_val_labels)


#
# Evaluating Joint Model
#

print ""
print "Evaluating joint model:"
print "Splitting data in two halves..."
fh_idx = np.random.choice(range(0, dynamic_train_data.shape[0]), size=np.round(dynamic_train_data.shape[0] * 0.5, 0), replace=False)
sh_idx = list(set(range(0, dynamic_train_data.shape[0])) - set(fh_idx))
fh_data = dynamic_train_data[fh_idx, :, :]
fh_labels = dynamic_train_labels[fh_idx]
# run CV
for fid, predict_idx in enumerate(predict_idx_list):
    print "Current fold is %d" % fid
    train_idx = list(set(range(nsamples)) - set(predict_idx))
    
    # extract predictions using RF on static
    print "    Extracting predictions on static data with RF..."
    rf = RandomForestClassifier(n_estimators=nestimators)
    rf.fit(static_all[train_idx], labels_all[train_idx])
    predictions_all_rf[predict_idx] = rf.predict_log_proba(static_all[predict_idx])
    predictions_all_rf[predictions_all_rf == -inf] = np.min(predictions_all_rf[predictions_all_rf != -inf])

    # extract predictions using HMM on dynamic
    print "    Extracting predictions on dynamic data with HMM..."
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, 
                                       dynamic_all[train_idx], labels_all[train_idx])
    predictions_all_hmm[predict_idx] = hmmcl.predict_log_proba(model_pos, model_neg, dynamic_all[predict_idx])
    ratios_all_hmm[predict_idx] = hmmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])

    # extract predictions using LSTM on dynamic
    print "    Extracting predictions on dynamic data with LSTM..."
    lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs)
    model_pos, model_neg = lstmcl.train(dynamic_all[train_idx], labels_all[train_idx])
    mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg, dynamic_all[predict_idx]) 
    predictions_all_lstm[predict_idx] = np.vstack((mse_pos, mse_neg)).T
    ratios_all_lstm[predict_idx] = lstmcl.pos_neg_ratios(model_pos, model_neg, dynamic_all[predict_idx])


#
labels_val = np.load(
    '/storage/hpc_anna/GMiC/Data/syn_lstm_wins/test_labels.npy')

#
# Sanity Checks
#
print "Expected performance of a lonely model is 0.75, of the joint model 1.0"

# static data with RF
rf = RandomForestClassifier(n_estimators=100)
rf.fit(static_train, labels_train)
print "Random Forest with static features on validation set: %.4f" % rf.score(
    static_val, labels_val)

# dynamic data with HMM
hmmcl = HMMClassifier()
model_pos, model_neg = hmmcl.train(3, 10, dynamic_train, labels_train)
print "HMM with dynamic features on validation set: %.4f" % hmmcl.test(
    model_pos, model_neg, dynamic_val, labels_val)

# dynamic data with LSTM
lstmcl = LSTMClassifier(2000, 0.5, 'adagrad', 20)
model_pos, model_neg = lstmcl.train(dynamic_train, labels_train)
print "LSTM with dynamic features on validation set: %.4f" % lstmcl.test(
    model_pos, model_neg, dynamic_val, labels_val)

# dynamic data with RF
print "Training RF on the dynamic dataset..."
dynamic_as_static_train = dynamic_train.reshape(
    (dynamic_train.shape[0], dynamic_train.shape[1] * dynamic_train.shape[2]))
dynamic_as_static_val = dynamic_val.reshape(
Пример #34
0
#
# Prepare combined datasets for the future experiments
#

# dataset to check how generative models perform if provided with static features along with dynamic
static_as_dynamic = np.zeros((static_all.shape[0], static_all.shape[1], dynamic_all.shape[2]))
for i in range(static_all.shape[0]):
    static_as_dynamic[i, :, :] = np.tile(static_all[i, :], (dynamic_all.shape[2], 1)).T
dynamic_and_static_as_dynamic = np.concatenate((dynamic_all, static_as_dynamic + np.random.uniform(-0.0001, 0.0001, static_as_dynamic.shape)), axis=1)


#
# k-fold CV for performance estimation
#
val_idx_list = np.array_split(range(nsamples), nfolds)
scores = {1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: [], 10: [], 11: []}
for fid, val_idx in enumerate(val_idx_list):
    print "Current fold is %d / %d" % (fid + 1, nfolds)
    train_idx = list(set(range(nsamples)) - set(val_idx))

    # HMM on dynamic and static (turned into fake sequences) (9)
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype, dynamic_and_static_as_dynamic[train_idx], labels_all[train_idx])
    acc, auc = hmmcl.test(model_pos, model_neg, dynamic_and_static_as_dynamic[val_idx], labels_all[val_idx])
    scores[9].append(acc)

print "===> (9) HMM on dynamic and static features: %.4f (+/- %.4f) %s" % (np.mean(scores[9]), np.std(scores[9]), scores[9])


for fid, predict_idx in enumerate(predict_idx_list):
    print "Current fold is %d" % fid
    train_idx = list(set(range(nsamples)) - set(predict_idx))

    # extract predictions using RF on static
    print "    Extracting predictions on static data with RF..."
    rf = RandomForestClassifier(n_estimators=nestimators)
    rf.fit(static_all[train_idx], labels_all[train_idx])
    predictions_all_rf[predict_idx] = rf.predict_log_proba(
        static_all[predict_idx])
    predictions_all_rf[predictions_all_rf == -inf] = np.min(
        predictions_all_rf[predictions_all_rf != -inf])

    # extract predictions using HMM on dynamic
    print "    Extracting predictions on dynamic data with HMM..."
    hmmcl = HMMClassifier()
    model_pos, model_neg = hmmcl.train(nhmmstates, nhmmiter, hmmcovtype,
                                       dynamic_all[train_idx],
                                       labels_all[train_idx])
    predictions_all_hmm[predict_idx] = hmmcl.predict_log_proba(
        model_pos, model_neg, dynamic_all[predict_idx])
    ratios_all_hmm[predict_idx] = hmmcl.pos_neg_ratios(
        model_pos, model_neg, dynamic_all[predict_idx])

    # extract predictions using LSTM on dynamic
    print "    Extracting predictions on dynamic data with LSTM..."
    lstmcl = LSTMClassifier(lstmsize, lstmdropout, lstmoptim, lstmnepochs,
                            lstmbatchsize)
    model_pos, model_neg = lstmcl.train(dynamic_all[train_idx],
                                        labels_all[train_idx])
    mse_pos, mse_neg = lstmcl.predict_mse(model_pos, model_neg,