示例#1
0
def run(audio_db, prepared_dataset=None, new_song=None):
    """Classify songs with k-nearest-neighbours and report the outcome.

    :param audio_db: forwarded to ``load_dataset`` together with ``new_song``.
    :param prepared_dataset: optional open file object holding a pickled
        training set; when truthy it replaces the freshly loaded training
        set and is shuffled in place.
    :param new_song: when truthy, only the first entry of the test set is
        classified (presumably that entry describes the new song -- confirm
        against ``load_dataset``).
    :return: with ``new_song`` the prediction for ``test_set[0]``; otherwise
        whatever ``dump_best_accuracy`` returns (per the original inline
        comment: True if the accuracy is bigger than a threshold else False).
    """
    # prepare data
    training_set, test_set = load_dataset(audio_db, new_song)
    if prepared_dataset:
        # NOTE(review): pickle.load can execute arbitrary code while
        # unpickling -- only pass files that come from a trusted source.
        training_set = pickle.load(prepared_dataset)
        shuffle(training_set)
    # Single-argument print(...) produces the same output on Python 2 and
    # also parses on Python 3, unlike the print statement.
    print('Train set: ' + repr(len(training_set)))
    print('Test set: ' + repr(len(test_set)))

    if new_song:
        # Single-song mode: classify the first test entry only.
        neighbors = get_neighbors(training_set, test_set[0], K_POINTS)
        return predict(neighbors)

    # Evaluation mode: predict every test entry, then score the predictions.
    predictions = []
    for sample in test_set:
        neighbors = get_neighbors(training_set, sample, K_POINTS)
        result = predict(neighbors)
        predictions.append(result)
        # The last element of a test entry is reported as the actual label.
        print('> predicted=' + repr(result) + ', actual=' +
              repr(sample[-1]))
    accuracy = get_accuracy(test_set, predictions)  # check accuracy
    print('Accuracy: ' + repr(accuracy))
    return dump_best_accuracy(dataset=training_set, accuracy=accuracy)
示例#2
0
文件: structure.py 项目: zodman/cbr
def init():
    """Load the training data, then interactively classify the test data.

    The rows of ``out.csv`` are inserted as training data.  For every row
    of ``test.csv`` the 10 nearest neighbours are fetched, their distances
    are appended to ``graphoutput.txt``, and the predicted value is shown
    to the user, who may override it.  The row is then stored with the
    final value written into its last position.
    """
    trainingdataset = process_dataset("out.csv")
    create_tables()
    insert_training(trainingdataset)
    testdataset = process_dataset("test.csv")

    # Start from an empty distance log.  The log below is opened in append
    # mode, so the file removed here must be the very same file (the
    # original removed "graphoutput.tx", which left the old log in place).
    try:
        os.unlink("graphoutput.txt")
    except OSError:
        # Nothing to remove (e.g. first run) -- that is fine.
        pass

    for data in testdataset:
        cat = replace_category(data)
        neighbors = get_neighbors(data, k=10)
        with open("graphoutput.txt", "a") as f:
            for i in neighbors:
                d = i.distance
                print(i.data.pprint())
                f.write("%s\n" % d)
        result = get_response(neighbors)
        print("%s %s >>>>>>>>>>>>> prediction: %s" % (cat, data[:-1], result))
        yes_no = raw_input("Desea validar el valor (y/[n])? ")
        # Compare by value: `is` tests object identity and only matched
        # here by an interpreter implementation detail.
        if yes_no == 'y':
            result = raw_input("nuevo valor 1 or 0: ")

        # Store the (possibly user-corrected) value in the last position.
        data[-1] = result
        insert_data(*data)
示例#3
0
    def get_weights_average_selected(x_train,
                                     dist_pair_mat,
                                     distance_algorithm='dtw'):
        """Build per-series weights around a randomly chosen centre series.

        A random series becomes the centre and receives weight 0.5.  Up to
        two series drawn at random from its (at most five) nearest
        neighbours share a total weight of 0.3, and all remaining series
        share what is left.

        :param x_train: sequence of time series; ``x_train[0].shape[1]`` is
            used as the number of dimensions.
        :param dist_pair_mat: pre-computed pairwise distance matrix, passed
            to ``get_neighbors`` as ``pre_computed_matrix``.
        :param distance_algorithm: key into
            ``utils.constants.DISTANCE_ALGORITHMS`` and
            ``utils.constants.DISTANCE_ALGORITHMS_PARAMS``.
        :return: tuple ``(weights, init_dba)`` where ``weights`` is a
            float64 array of shape ``(n, num_dim)`` and ``init_dba`` is the
            centre series (to be used as init for DBA).
        """
        # distance function and its parameters
        dist_fun = utils.constants.DISTANCE_ALGORITHMS[distance_algorithm]
        dist_fun_params = utils.constants.DISTANCE_ALGORITHMS_PARAMS[
            distance_algorithm]
        # number of dimensions
        num_dim = x_train[0].shape[1]
        # number of time series
        n = len(x_train)
        # maximum number of K for KNN
        max_k = 5
        # maximum number of sub neighbors
        max_subk = 2
        # the real k for knn: the centre itself cannot count as a neighbour
        k = min(max_k, n - 1)
        # never select more neighbours than were retrieved
        subk = min(max_subk, k)
        # the weight for the center
        weight_center = 0.5
        # the total weight of the neighbors
        weight_neighbors = 0.3
        # total weight of the non neighbors
        weight_remaining = 1.0 - weight_center - weight_neighbors
        # number of non neighbors
        n_others = n - 1 - subk
        # weight for each non neighbor (nothing to distribute when none exist)
        if n_others == 0:
            fill_value = 0.0
        else:
            fill_value = weight_remaining / n_others
        # choose a random time series
        idx_center = random.randint(0, n - 1)
        # get the init dba
        init_dba = x_train[idx_center]
        # init the weight matrix or vector for univariate time series
        weights = np.full((n, num_dim), fill_value, dtype=np.float64)
        # With a single series there are no neighbours (subk == 0); skipping
        # this step avoids dividing the neighbour weight by zero.
        if subk > 0:
            # find the top k nearest neighbors
            topk_idx = np.array(
                get_neighbors(x_train,
                              init_dba,
                              k,
                              dist_fun,
                              dist_fun_params,
                              pre_computed_matrix=dist_pair_mat,
                              index_test_instance=idx_center))
            # select a subset of the k nearest neighbors
            final_neighbors_idx = np.random.permutation(k)[:subk]
            # adjust the weight of the selected neighbors
            weights[topk_idx[final_neighbors_idx]] = weight_neighbors / subk
        # Assign the centre weight last so that it cannot be overwritten if
        # the centre shows up among its own nearest neighbours; the returned
        # instance must be the one with the maximum weight.
        weights[idx_center] = weight_center
        # return the weights and the instance with maximum weight (to be used
        # as init for DBA)
        return weights, init_dba
示例#4
0
def get_weights_average_selected(tseries, dist_pair_mat, distance_algorithm='DTW'):
    """
    Calculate weights with average selected method
    :param array tseries: the list of time series
    :param array dist_pair_mat: the distance matrix
    :param string distance_algorithm: the distance algorithm (not read by
        this function itself)
    :return: the weights of each sequence and the randomly chosen center
        series of the tseries
    """
    # NOTE(review): dist_fun and dist_fun_params are not defined in this
    # function; they are presumably module-level names -- confirm.
    # get the number of the train set
    n = len(tseries)
    # maximum number of K for KNN
    max_k = 5
    # maximum number of sub neighbors
    max_subk = 2
    # get the real k for knn
    k = min(max_k, n - 1)
    # never select more neighbors than were retrieved
    subk = min(max_subk, k)
    # the weight for the center
    weight_center = 0.5
    # the total weight of the neighbors
    weight_neighbors = 0.3
    # total weight of the non neighbors
    weight_remaining = 1.0 - weight_center - weight_neighbors
    # number of non neighbors
    n_others = n - 1 - subk
    # get the weight for each non neighbor
    if n_others == 0:
        fill_value = 0.0
    else:
        fill_value = weight_remaining / n_others
    # choose a random time series
    idx_center = random.randint(0, n - 1)
    # get the init dba
    init_dba = tseries[idx_center]
    # init the weight vector
    weights = np.full(n, fill_value, dtype=np.float64)
    # fill the weight of the center
    weights[idx_center] = weight_center
    # With a single series there are no neighbors (subk == 0); skipping this
    # step avoids dividing the neighbor weight by zero.
    if subk > 0:
        # find the top k nearest neighbors
        topk_idx = np.array(get_neighbors(tseries, init_dba, k, dist_fun, dist_fun_params,
                                          pre_computed_matrix=dist_pair_mat,
                                          index_test_instance=idx_center))
        # positions in topk_idx that do not refer to the center itself
        eligible = np.flatnonzero(topk_idx != idx_center)
        if len(eligible) >= subk:
            # select a subset of the k nearest neighbors
            final_neighbors_idx = np.random.permutation(k)[:subk]
            # Redraw until the subset no longer contains the initially chosen
            # center, so that the center weight is not overwritten.
            while idx_center in topk_idx[final_neighbors_idx]:
                final_neighbors_idx = np.random.permutation(k)[:subk]
        else:
            # Too few non-center candidates: redrawing could never succeed,
            # so take every candidate that is left instead of looping forever.
            final_neighbors_idx = eligible
        # adjust the weight of the selected neighbors
        if len(final_neighbors_idx) > 0:
            weights[topk_idx[final_neighbors_idx]] = (
                weight_neighbors / len(final_neighbors_idx))
    # return the weights and the instance with maximum weight (to be used as
    # init for DBA )
    return weights, init_dba
示例#5
0
def lvq3(prototypes, training_set, alpha=1, epsilon=.6):
    """Run one LVQ3 pass over ``training_set``.

    For every training row the two nearest prototypes are fetched and both
    are passed to ``adjust``.  The rate handed to ``adjust`` is ``alpha``,
    or ``alpha * epsilon`` when both prototypes carry the row's label (the
    last element of each vector is compared as the label).

    :return: the ``prototypes`` object that was passed in.
    """
    for sample in training_set:
        first, second = knn.get_neighbors(sample, prototypes, 2)

        # Damp the update when both nearest prototypes already agree with
        # the sample's label.
        both_agree = first[-1] == sample[-1] and second[-1] == sample[-1]
        rate = alpha * epsilon if both_agree else alpha

        for prototype in (first, second):
            adjust(prototype, sample, rate)
    return prototypes
示例#6
0
def lvq2(prototypes, training_set, alpha=1, w=.6):
    """Run one LVQ2 pass over ``training_set``.

    The two nearest prototypes of a row are passed to ``adjust`` only when
    all of the following hold: their labels differ, at least one of them
    carries the row's label, and ``windowed(w, row, p1, p2)`` is truthy.
    The last element of each vector is compared as the label.

    :return: the ``prototypes`` object that was passed in.
    """
    for sample in training_set:
        first, second = knn.get_neighbors(sample, prototypes, 2)

        labels_differ = first[-1] != second[-1]
        # `and`/`or` short-circuit, so `windowed` is only consulted once
        # the two label checks have passed.
        if (labels_differ
                and (first[-1] == sample[-1] or second[-1] == sample[-1])
                and windowed(w, sample, first, second)):
            adjust(first, sample, alpha)
            adjust(second, sample, alpha)
    return prototypes
示例#7
0
def lvq1(training_set, prototype_num, alpha=1):
    """Create ``prototype_num`` prototypes and run one LVQ1 pass.

    Prototypes come from ``random_vector(training_set)``; their last
    element is then overwritten with a class label, cycling through the
    distinct labels found in the training set.  Afterwards every training
    row is passed to ``adjust`` together with its nearest prototype.

    :raises ValueError: if ``prototype_num`` is smaller than the number of
        distinct labels.
    :return: the list of prototypes.
    """
    # The label of a row is its last element.
    classes = {sample[-1] for sample in training_set}
    class_count = len(classes)

    if prototype_num < class_count:
        raise ValueError(
            'Number of prototypes ({}) must be equal or greater then number of classes ({})'
            .format(prototype_num, class_count))

    prototypes = [random_vector(training_set) for _ in range(prototype_num)]

    # Hand the labels out round-robin so each class gets a prototype.
    labels = list(classes)
    for position, prototype in enumerate(prototypes):
        prototype[-1] = labels[position % len(labels)]

    for sample in training_set:
        nearest = knn.get_neighbors(sample, prototypes, 1)[0]
        adjust(nearest, sample, alpha)
    return prototypes
示例#8
0
def perform_prediction(recipient_id, raw_input):
    """Predict a label for a user-supplied point and send it back.

    :param recipient_id: key into ``users`` and the recipient of the reply.
    :param raw_input: text of the form ``"x,y"`` with two integers.

    A "Wrong input on prediction" message is sent, and nothing is
    predicted, when the text does not consist of exactly two
    comma-separated integers.
    """
    wrong_input_message = "Wrong input on prediction"

    fields = raw_input.split(',')
    # Check that the input is of shape (x,y)
    if len(fields) != 2:
        bot.send_text_message(recipient_id, wrong_input_message)
        return

    # Convert the input to predict into the right format.  Non-numeric
    # fields are reported to the user like any other malformed input
    # instead of raising ValueError out of the handler.
    try:
        to_predict = [int(fields[0]), int(fields[1])]
    except ValueError:
        bot.send_text_message(recipient_id, wrong_input_message)
        return

    # Get the user's training set
    train_to_fit = np.array(users[recipient_id].training_set)

    neighbors = get_neighbors(training_set=train_to_fit,
                              test_instance=to_predict,
                              k=k)
    majority_vote = get_majority_vote(neighbors)
    print('Predicted label=' + str(majority_vote))

    message = "Predicted label {}".format(str(majority_vote))
    bot.send_text_message(recipient_id, message)
示例#9
0
 # this is also a loop over the names of the datasets
 for c in classes:
     # get the x_train without the test instances
     x_train = tot_x_train[np.where(tot_y_train != c)]
     # get the y_train without the test instances
     y_train = tot_y_train[np.where(tot_y_train != c)]
     # get the x_test instances
     x_test = tot_x_train[np.where(tot_y_train == c)]
     # init the distances
     distances = []
     # loop through each test instances
     for x_test_instance in x_test:
         # get the nearest neighbors
         distance_neighbors = get_neighbors(x_train,
                                            x_test_instance,
                                            0,
                                            dist_fun,
                                            dist_fun_params,
                                            return_distances=True)
         # concat the distances
         distances = distances + distance_neighbors
     # sort list by specifying the second item to be sorted on
     distances.sort(key=operator.itemgetter(1))
     # to numpy array the second item only (the label)
     distances = np.array([y_train[distances[i][0]] \
            for i in range(len(distances))])
     # aggregate the closest datasets
     # this is useful if two datasets are in the k nearest neighbors
     # more than once because they have more than one similar class
     distances = pd.unique(distances)
     # leave only the k nearest ones
     for i in range(1, nb_neighbors + 1):
示例#10
0
def test_get_neighbors(train, test_row, num_neighbors):
    """Print the neighbours that ``knn.get_neighbors`` returns.

    :param train: forwarded to ``knn.get_neighbors`` as first argument.
    :param test_row: forwarded as second argument.
    :param num_neighbors: forwarded as third argument.
    """
    neighbors = knn.get_neighbors(train, test_row, num_neighbors)
    # Single-argument print(...) behaves the same on Python 2 and also
    # parses on Python 3, matching the call form used in the loop below.
    print('Neighbours:')
    for neighbor in neighbors:
        print(neighbor)