Example #1
def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    # train_test_split returns (train, test); keep only the 20% split and discard the rest
    _, ub_basket = train_test_split(ub_basket, test_size=0.20, random_state=0)
    #embedding_wrapper = EmbeddingWrapper('tafeng_products')
    print(ub_basket)

    all_baskets = ub_basket.basket.values
    print(all_baskets)
    # change every item to a string
    print("nested change")
    all_baskets = nested_change(list(all_baskets), str)
    print("embedding_wrapper.remove_products_wo_embeddings(all_baskets)")
    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    print("uncommon products")
    all_baskets = remove_products_which_are_uncommon(all_baskets)
    print("short baskets")
    medium_baskets, all_baskets = remove_short_baskets(all_baskets)
    print(medium_baskets, all_baskets)
    print("nested change")
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)
    print("split_data")
    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(all_baskets)
    print('knndtw')
    knndtw = KnnDtw(n_neighbors=[5])
    preds_all, distances = knndtw.predict(train_ub, val_ub_input, embedding_wrapper.basket_dist_EMD, 
                                          embedding_wrapper.basket_dist_REMD)
    print(preds_all)
    print(distances)
    #print("Wasserstein distance", sum(distances)/len(distances))
    return preds_all, distances
Example #2
def k_fold_cross_val(k_list, train, label, folds):

    # Randomly shuffle the data and labels into the same order
    perm = np.arange(train.shape[0])
    np.random.shuffle(perm)
    train = train[perm]
    label = label[perm]
    k_scores = []  # averaged score for each k value, num of scores = num of k values

    # Split the training data into train and test folds
    label_name = {
        1: 'Hover',
        2: 'Impact (Front Left)',
        3: 'Impact (Front Right)',
        4: 'Impact (Back Left)',
        5: 'Impact (Back Right)',
        6: 'Gust (from Left)',
        7: 'Gust (from Right)',
        8: 'Gust (from front)'
    }
    kf = KFold(n_splits=folds)
    kf.get_n_splits(train)
    for K in k_list:
        scores = []  # fold scores for this k value, num of scores = num of folds
        for train_index, test_index in kf.split(train):
            print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = train[train_index], train[test_index]
            y_train, y_test = label[train_index], label[test_index]
            # Construct a fresh estimator for each fold so no fitted state leaks between fits
            clf = KnnDtw(n_neighbors=K, max_warping_window=100)
            clf.fit(X_train, y_train)
            labels, proba = clf.predict(X_test)
            #print(classification_report(labels, y_test, target_names=[l for l in label_name.values()]))
            acc = accuracy_score(y_test, labels)
            print('Accuracy for this fold is:', acc)
            scores.append(acc)
        score = np.average(scores)  # average the fold scores into a single score for this k
        k_scores.append(score)
    #Plot the average accuracy score for each k and recommend the best (highest accuracy) k
    k_best = k_list[np.argmax(k_scores)]

    plt.bar(k_list, k_scores, width=0.2)
    plt.xlabel('k (nearest neighbors)')
    plt.ylabel('Accuracy (average)')
    plt.xticks(k_list)
    print('Best k value from list is:', k_best)
    return k_best
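
A minimal usage sketch for the helper above; the .npy paths and the loading step are hypothetical stand-ins for however you produce your arrays:

import numpy as np

# Hypothetical loader: `train` is shaped (n_series, n_samples), `label` holds integer class ids
train = np.load('train_data.npy')
label = np.load('train_labels.npy')

# Try k = 1..10 with 6-fold cross-validation and keep the best-scoring k
best_k = k_fold_cross_val(list(range(1, 11)), train, label, folds=6)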
Example #3
def machine_learning(x_test_final, x_train_final, y_train_final, labels,
                     param_weights, para_weights, param_list, k_value):
    param_labels = []
    for i in range(0, len(x_train_final)):
        #Analyze dataset

        x_train_final2 = np.array(x_train_final[i])
        y_train_final2 = np.array(y_train_final[i])
        x_test_final2 = np.array(x_test_final[i])

        m = KnnDtw(n_neighbors=k_value, max_warping_window=100)
        m.fit(x_train_final2, y_train_final2)
        label, proba = m.predict(x_test_final2)
        #get the weight for this parameter
        if param_weights is None:
            param_labels.append(label)  #if we don't have weights do this
        else:
            weight = [para_weights[param_list[i]]]
            param_labels.append(list(zip(
                label, weight * len(label))))  #a tuple list of (label, weight)

    param_labels = np.array(param_labels)
    if param_weights is None:
        para_mode, para_count = stats.mode(param_labels)
        para_mode = np.reshape(para_mode, (para_mode.shape[1], ))
    else:  #for weights
        para_mode = [0] * param_labels.shape[1]
        for i in range(param_labels.shape[1]):
            # Total weight each label received for this time series
            mode_count = [0] * len(labels)
            col = param_labels[:, i]
            for p in col:
                mode_count[int(p[0]) - 1] += p[1]  # cast to int: the numpy array stores floats
            # The label with the highest total weight becomes the overall label
            para_mode[i] = mode_count.index(max(mode_count)) + 1
            #para_mode = np.reshape(para_mode,(para_mode.shape[1],))

    #Using mode to see which classification was the most frequent for each data from all parameters used
    #k_val = list(range(1,11))
    #k_fold_cross_val(k_val,x_train,y_train,6)
    return param_labels, para_mode
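
The weighted branch above is a weighted majority vote: each parameter casts its predicted label with its weight, and the label with the highest total weight wins. A standalone sketch of the same aggregation with toy numbers (not from the source):

# Each row holds one parameter's (label, weight) votes for two test series
votes = [
    [(1, 0.5), (2, 0.5)],  # parameter A
    [(1, 0.3), (1, 0.3)],  # parameter B
    [(2, 0.2), (2, 0.2)],  # parameter C
]

n_labels = 2
final = []
for col in zip(*votes):  # one column per test series
    tally = [0.0] * n_labels
    for lbl, w in col:
        tally[lbl - 1] += w  # labels are 1-indexed
    final.append(tally.index(max(tally)) + 1)

print(final)  # [1, 2]: label 1 wins series 0 (0.8 vs 0.2), label 2 wins series 1 (0.7 vs 0.3)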
Example #4
def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)

    all_baskets = ub_basket.basket.values
    all_baskets = nested_change(list(all_baskets), str)

    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    all_baskets = remove_products_which_are_uncommon(all_baskets)
    all_baskets = remove_short_baskets(all_baskets)
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)

    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(
        all_baskets)

    knndtw = KnnDtw(n_neighbors=[5])
    preds_all, distances = knndtw.predict(train_ub, val_ub_input,
                                          embedding_wrapper.basket_dist_EMD,
                                          embedding_wrapper.basket_dist_REMD)
    return preds_all, distances
Example #5
def param_ranking(param_list, k_val, warp_val, datapath, avg_type):
    start_time = time.time()
    p = []
    r = []
    f = []
    for dataparam in param_list:
        trainingdatafile = datapath + 'train_' + dataparam + '.txt'
        traininglabelfile = datapath + 'train_labels.txt'

        testdatafile = datapath + 'test_' + dataparam + '.txt'
        testlabelfile = datapath + 'test_labels.txt'

        # Open training data file, x:data, y:label
        x_train_file = open(trainingdatafile, 'r')
        y_train_file = open(traininglabelfile, 'r')

        #Open test data file, x:data, y:label
        x_test_file = open(testdatafile, 'r')
        y_test_file = open(testlabelfile, 'r')

        # Create empty lists
        x_train = []
        y_train = []
        x_test = []
        y_test = []

        # Mapping table for classes
        labels = {
            1: 'Hover',
            2: 'Impact (Front Left)',
            3: 'Impact (Front Right)',
            4: 'Impact (Back Left)',
            5: 'Impact (Back Right)',
            6: 'Gust (from Left)',
            7: 'Gust (from Right)',
            8: 'Gust (from front)'
        }

        # Loop through datasets
        for x in x_train_file:
            x_train.append([float(ts) for ts in x.split()])
        for y in y_train_file:
            y_train.append(int(y.rstrip('\n')))

        for x in x_test_file:
            x_test.append([float(ts) for ts in x.split()])

        for y in y_test_file:
            y_test.append(int(y.rstrip('\n')))

        #close data files
        x_train_file.close()
        y_train_file.close()
        x_test_file.close()
        y_test_file.close()

        # Convert to numpy for efficiency

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        m = KnnDtw(n_neighbors=k_val, max_warping_window=warp_val)
        m.fit(x_train, y_train)
        label, proba = m.predict(x_test)

        precision, recall, f_score, _ = score(y_test, label, average=avg_type)
        p.append(precision)
        r.append(recall)
        f.append(f_score)

    precision_rank = sorted(list(zip(param_list, p)), key=lambda x: x[1])
    recall_rank = sorted(list(zip(param_list, r)), key=lambda x: x[1])
    fscore_rank = sorted(list(zip(param_list, f)), key=lambda x: x[1])
    #print("Parameter rank by precision is:", precision_rank)
    print('Ranking for k = %s, max warping window = %s' % (k_val, warp_val))
    for rank in precision_rank[::-1]:
        print(rank[0], ": ", rank[1])
    #print("Parameter rank by recall is:",recall_rank)
    #print("Parameter rank by f-score is:",fscore_rank)
    print("--- %s seconds ---" %
          (time.time() - start_time))  #let's see how long this takes...
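
A sketch of how param_ranking might be invoked, assuming datapath holds train_<param>.txt / test_<param>.txt files in the layout the loop above expects; the parameter names are illustrative:

# Illustrative parameter names; use whatever suffixes your data files carry
params = ['accel_x', 'accel_y', 'accel_z', 'gyro_x']
param_ranking(params, k_val=1, warp_val=100, datapath='./data/', avg_type='weighted')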
Example #6
#Plot Test data
if plotdata:
    plt.figure(figsize=(11,7))
    colors = ['#D62728','#2C9F2C','#FD7F23','#1F77B4','#9467BD',
              '#8C564A','#7F7F7F','#1FBECF','#E377C2','#BCBD27']
    for i, r in enumerate(range(10)):
        plt.subplot(5,2,i+1)
        plt.plot(x_test[r], label=labels[y_test[r]], color=colors[i], linewidth=2)
        plt.xlabel('Samples @50Hz')
        plt.legend(loc='upper left')
        plt.tight_layout()
        

#Analyze dataset
m = KnnDtw(n_neighbors, max_warping_window)
m.fit(x_train,y_train)
label, proba = m.predict(x_test)

#Classification report
from sklearn.metrics import classification_report, confusion_matrix
# classification_report expects (y_true, y_pred)
print(classification_report(y_test, label,
                            target_names=[l for l in labels.values()]))


#Confusion Matrix
conf_mat = confusion_matrix(label, y_test)

fig = plt.figure(figsize=(8,8))
width = np.shape(conf_mat)[1]
height = np.shape(conf_mat)[0]
Example #7
import matplotlib
matplotlib.use('TkAgg')  # select the backend before pyplot is imported

import numpy as np
import matplotlib.pyplot as plt

from data import getData
from knndtw import KnnDtw

if __name__ == "__main__":

    k_range = 50
    accuracy, k_list = [], []
    range_list = [50, 100, 200, 300]
    duration_list = np.arange(0.5, 2.5, 0.5).tolist()
    sample_dict = getData(random_range=range_list,
                          DURATION_TO_EXAMINE=duration_list)

    classifier = KnnDtw(k_range, sample_dict, random_range=range_list)
    # dicts = classifier.get_pos_dict(50)
    # dicts2 = classifier.get_label_dict(50)
    # dict3 = classifier.get_n_neighbors_dict(50)
    acc = classifier.merge_view()

    # Drawing
    fig = plt.figure(figsize=(12, 4))
    for key in acc:
        ks = range(1, len(acc[key]) + 1)
        plt.plot(ks, acc[key], lw=1, marker='o', label=str(key))

    plt.legend()
    plt.show()
Example #8
##  trainer.py
##
##  "Builds model.p using data found in .\data\training"
##

import os
import pickle

from numpy import array
from knndtw import KnnDtw
from imudata import IMUData

FALL = 1
NOT_FALL = 2

model = KnnDtw(subsample_step=5)  # change parameters here!

train_files = list()
for root, dirs, files in os.walk(r".\data\training\fall", topdown=False):
    for name in files:
        train_files.append(os.path.join(root, name))
labels = [FALL] * len(train_files)
for root, dirs, files in os.walk(r".\data\training\not fall", topdown=False):
    for name in files:
        train_files.append(os.path.join(root, name))
labels.extend([NOT_FALL] * (len(train_files) - len(labels)))

train_data = list()
for file in train_files:
    print("Reading", file)
    data = IMUData()
Example #9
freqs = np.load('processed/series_freqs.npy')
masses = np.load('processed/series_masses.npy')
labels = np.load('processed/series_labels.npy')

print(" ")
print("=============================")
print(" INSTANTIATING KNN-DTW MODEL ")
print("=============================")
print(" ")
print("Loading List of Neighbours...")
print("freqs:", freqs.shape)
print("masses:", masses.shape)
print("labels:", labels.shape)
print(" ")
freq_model = KnnDtw(n_neighbors=10, max_warping_window=10)
freq_model.fit(freqs, labels)
mass_model = KnnDtw(n_neighbors=10, max_warping_window=10)
mass_model.fit(masses, labels)

# Setup client socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(('localhost', 8888))
# s2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM) # Second connection
# s2.connect(('localhost', 8890))

s.send("series".encode())
parse = s.recv(1024).decode()
if parse in ("yes", "no"):
    print("Authorisation Success.")
    print("Series Classifier Ready.")
Example #10
def multi_param_learn(param_list, param_weights, k_value, datapath):

    start_time = time.time()
    #this is the list that will store labels returned from each param before aggregating
    param_labels = []
    if param_weights is not None:
        if len(param_list) != len(param_weights):
            raise Exception(
                'When using weights, there must be one weight for each parameter!'
            )
        para_weights = dict(zip(param_list, param_weights))

    para_k = dict(zip(param_list, k_value))

    for dataparam in param_list:

        trainingdatafile = datapath + 'train_' + dataparam + '.txt'
        traininglabelfile = datapath + 'train_labels.txt'

        testdatafile = datapath + 'test_' + dataparam + '.txt'
        testlabelfile = datapath + 'test_labels.txt'

        # Open training data file, x:data, y:label
        x_train_file = open(trainingdatafile, 'r')
        y_train_file = open(traininglabelfile, 'r')

        #Open test data file, x:data, y:label
        x_test_file = open(testdatafile, 'r')
        y_test_file = open(testlabelfile, 'r')

        # Create empty lists
        x_train = []
        y_train = []
        x_test = []
        y_test = []

        # Mapping table for classes
        labels = {
            1: 'Hover',
            2: 'Impact (Front Left)',
            3: 'Impact (Front Right)',
            4: 'Impact (Back Left)',
            5: 'Impact (Back Right)',
            6: 'Gust (from Left)',
            7: 'Gust (from Right)',
            8: 'Gust (from front)'
        }

        # Loop through datasets
        for x in x_train_file:
            x_train.append([float(ts) for ts in x.split()])
        for y in y_train_file:
            y_train.append(int(y.rstrip('\n')))

        for x in x_test_file:
            x_test.append([float(ts) for ts in x.split()])

        for y in y_test_file:
            y_test.append(int(y.rstrip('\n')))

        #close data files
        x_train_file.close()
        y_train_file.close()
        x_test_file.close()
        y_test_file.close()

        # Convert to numpy for efficiency

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        ##plot train data
        #plt.figure(figsize=(11,7))
        #colors = ['#D62728','#2C9F2C','#FD7F23','#1F77B4','#9467BD',
        #          '#8C564A','#7F7F7F','#1FBECF','#E377C2','#BCBD27',
        #          '#D62728','#2C9F2C']
        #for i, r in enumerate([0,1,2,3,5,6,7,8,9,10,11,12]):
        #    plt.subplot(7,2,i+1)
        #    plt.plot(x_train[r], label=labels[y_train[r]], color=colors[i], linewidth=2)
        #    plt.xlabel('Samples @50Hz')
        #    plt.legend(loc='upper left')
        #    plt.tight_layout()
        #
        ##Plot Test data
        #plt.figure(figsize=(11,7))
        #colors = ['#D62728','#2C9F2C','#FD7F23','#1F77B4','#9467BD',
        #          '#8C564A','#7F7F7F','#1FBECF','#E377C2','#BCBD27']
        #for i, r in enumerate([0,1,2,3,4,5]):
        #    plt.subplot(3,2,i+1)
        #    plt.plot(x_test[r], label=labels[y_test[r]], color=colors[i], linewidth=2)
        #    plt.xlabel('Samples @50Hz')
        #    plt.legend(loc='upper left')
        #    plt.tight_layout()

        #Analyze dataset
        print('Algorithm running for param %s with k value %i' %
              (dataparam, para_k[dataparam]))
        m = KnnDtw(n_neighbors=para_k[dataparam], max_warping_window=100)
        m.fit(x_train, y_train)
        label, proba = m.predict(x_test)
        #get the weight for this parameter
        if param_weights is None:
            param_labels.append(label)  #if we don't have weights do this
        else:
            weight = [para_weights[dataparam]]
            param_labels.append(list(zip(
                label, weight * len(label))))  #a tuple list of (label, weight)

    param_labels = np.array(param_labels)
    if param_weights is None:
        para_mode, para_count = stats.mode(param_labels)
        para_mode = np.reshape(para_mode, (para_mode.shape[1], ))
    else:  #for weights
        para_mode = [0] * param_labels.shape[1]  #a zero array to hold the final label values
        for i in range(param_labels.shape[1]):
            # Total weight each label received for this time series
            mode_count = [0] * len(labels)
            col = param_labels[:, i]
            for p in col:
                mode_count[int(p[0] - 1)] += p[1]
            # The label with the highest total weight becomes the overall label
            para_mode[i] = mode_count.index(max(mode_count)) + 1
            #para_mode = np.reshape(para_mode,(para_mode.shape[1],))

    #Using mode to see which classification was the most frequent for each data from all parameters used
    #k_val = list(range(1,11))
    #k_fold_cross_val(k_val,x_train,y_train,6)

    #Classification report
    """ASSUMPTION: 
        We're trying to see accuracy of labelling as a result of multi param voting, but 
        we are only comparing to one y_test belonging to one (last) parameter with the current implementation
        we're assuming that y_test is the same across all param which builds on the assumption that
        train/test data for all param are from the same time period!
    """
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import precision_recall_fscore_support as score
    print(
        classification_report(y_test,
                              para_mode,
                              target_names=[l for l in labels.values()]))

    #Confusion Matrix
    conf_mat = confusion_matrix(para_mode, y_test)

    fig = plt.figure(figsize=(8, 8))
    width = np.shape(conf_mat)[1]
    height = np.shape(conf_mat)[0]

    res = plt.imshow(np.array(conf_mat),
                     cmap=plt.cm.summer,
                     interpolation='nearest')
    for i, row in enumerate(conf_mat):
        for j, c in enumerate(row):
            if c > 0:
                plt.text(j - .2, i + .1, c, fontsize=16)

    #cb = fig.colorbar(res)
    plt.title('Confusion Matrix for ' + ', '.join(param_list))
    plt.xlabel('Data')
    plt.ylabel('ML Identification')
    _ = plt.xticks(range(len(labels)), [l for l in labels.values()], rotation=90)
    _ = plt.yticks(range(len(labels)), [l for l in labels.values()])
    #print how long this function ran
    print("Runtime was %s seconds" % (time.time() - start_time))
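
A usage sketch for multi_param_learn; the parameter names, weights, and per-parameter k values are placeholders:

# One weight and one k per parameter; pass param_weights=None for a plain (unweighted) mode vote
multi_param_learn(param_list=['accel_x', 'accel_y', 'accel_z'],
                  param_weights=[0.4, 0.3, 0.3],
                  k_value=[1, 1, 3],
                  datapath='./data/')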
Example #11
            y_test.append(int(y.rstrip('\n')))

        #close data files
        x_train_file.close()
        y_train_file.close()
        x_test_file.close()
        y_test_file.close()

        # Convert to numpy for efficiency
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        #Analyze dataset
        m = KnnDtw(n_neighbors=3, max_warping_window=500)
        # Subsample the labels in step with the data so x and y stay aligned
        m.fit(x_train[::trainsample], y_train[::trainsample])
        label, proba = m.predict(x_test)

        #Classification report
        from sklearn.metrics import classification_report, confusion_matrix
        # classification_report expects (y_true, y_pred)
        print(
            classification_report(y_test,
                                  label,
                                  target_names=[l for l in labels.values()]))

        #Confusion Matrix
        conf_mat = confusion_matrix(label, y_test)

        fig = plt.figure(figsize=(7, 7))
        width = np.shape(conf_mat)[1]
Example #12
            num += 1
            acc_num += 1
            recall += 1
            precision += 1
        if result == "falsepos":
            precision += 1
        if result == "trueneg":
            acc_num += 1
        if result == "falseneg":
            recall += 1
    # Returns (accuracy, precision, recall): num counts true positives, while the
    # precision and recall counters accumulated their denominators (TP+FP, TP+FN) above
    return (acc_num / len(results), num / precision, num / recall)

FALL = 1
NOT_FALL = 2

model = KnnDtw()  # change parameters here! using defaults currently

train_files = list()
scrapeNames(train_files, r".\data\training\fall")
labels = [FALL] * len(train_files)
scrapeNames(train_files, r".\data\training\not fall")
labels.extend([NOT_FALL] * (len(train_files) - len(labels)))

train_data = list()
for file in train_files:
    print("Reading", file)
    data = IMUData()
    with open(file, 'r') as f:
        for i, line in enumerate(f):
            data.append(line)
            if i == 112: