def run(audio_db, prepared_dataset=None, new_song=None):
    """Classify a single new song, or evaluate the classifier on the test set.

    :param audio_db: forwarded to ``load_dataset`` to build the training and
        test sets.
    :param prepared_dataset: optional file object holding a pickled training
        set; when truthy it replaces the training set returned by
        ``load_dataset``.
    :param new_song: forwarded to ``load_dataset``; when truthy only the first
        test instance is classified and no accuracy is computed.
    :return: the prediction for ``test_set[0]`` when ``new_song`` is truthy,
        otherwise the result of ``dump_best_accuracy`` (True if the accuracy
        is bigger than the threshold, else False).
    """
    training_set, test_set = load_dataset(audio_db, new_song)
    if prepared_dataset:
        # NOTE(review): unpickling runs arbitrary code from the stream, so
        # only trusted files should ever be passed here.
        training_set = pickle.load(prepared_dataset)
    shuffle(training_set)

    print('Train set: ' + repr(len(training_set)))
    print('Test set: ' + repr(len(test_set)))

    # Single-song mode: classify the first test instance and stop.
    if new_song:
        return predict(get_neighbors(training_set, test_set[0], K_POINTS))

    # Evaluation mode: predict every test instance, then score the run.
    predictions = []
    for sample in test_set:
        guess = predict(get_neighbors(training_set, sample, K_POINTS))
        predictions.append(guess)
        print('> predicted=' + repr(guess) + ', actual=' + repr(sample[-1]))

    accuracy = get_accuracy(test_set, predictions)
    print('Accuracy: ' + repr(accuracy))
    return dump_best_accuracy(dataset=training_set, accuracy=accuracy)
def init():
    """Store the training data, then interactively classify each test row.

    The training rows from ``out.csv`` are inserted into freshly created
    tables.  For every row of ``test.csv`` the 10 nearest neighbours are
    looked up, their distances are appended to ``graphoutput.txt``, the
    prediction is printed, and the operator may override it.  The row is then
    stored with the final label in its last position.
    """
    output_path = "graphoutput.txt"

    trainingdataset = process_dataset("out.csv")
    create_tables()
    insert_training(trainingdataset)
    testdataset = process_dataset("test.csv")

    # The distance log is opened in append mode below, so a file left over
    # from a previous run has to be removed first.  A missing file is fine.
    try:
        os.unlink(output_path)
    except OSError:
        pass

    for data in testdataset:
        cat = replace_category(data)
        neighbors = get_neighbors(data, k=10)
        with open(output_path, "a") as f:
            for neighbor in neighbors:
                print(neighbor.data.pprint())
                f.write("%s\n" % neighbor.distance)

        result = get_response(neighbors)
        print("%s %s >>>>>>>>>>>>> prediction: %s" % (cat, data[:-1], result))

        # Compare by value: identity ("is") on strings only works by accident
        # of interpreter-level string caching.
        yes_no = raw_input("Desea validar el valor (y/[n])? ")
        if yes_no == 'y':
            result = raw_input("nuevo valor 1 or 0: ")

        # Persist the row with the predicted (or operator-corrected) label.
        data[-1] = result
        insert_data(*data)
def get_weights_average_selected(x_train, dist_pair_mat, distance_algorithm='dtw'):
    """Compute the "average selected" weights used to initialise weighted DBA.

    A randomly chosen series becomes the centre and receives weight 0.5.  Up
    to two series drawn at random from the centre's (at most five) nearest
    neighbours share a total weight of 0.3.  The remaining 0.2 is spread
    evenly over all other series.

    :param x_train: sequence of time series; ``x_train[0].shape[1]`` is read
        as the number of dimensions.
    :param dist_pair_mat: pre-computed pairwise distance matrix, forwarded to
        ``get_neighbors``.
    :param distance_algorithm: key into
        ``utils.constants.DISTANCE_ALGORITHMS`` and
        ``utils.constants.DISTANCE_ALGORITHMS_PARAMS``.
    :return: ``(weights, init_dba)`` where ``weights`` is a float64 array of
        shape ``(n, num_dim)`` and ``init_dba`` is the centre series (the one
        with maximum weight, to be used as the DBA initialisation).
    """
    dist_fun = utils.constants.DISTANCE_ALGORITHMS[distance_algorithm]
    dist_fun_params = utils.constants.DISTANCE_ALGORITHMS_PARAMS[
        distance_algorithm]

    num_dim = x_train[0].shape[1]
    n = len(x_train)

    # k nearest neighbours are retrieved, subk of them are actually weighted;
    # both are capped by the number of series other than the centre.
    max_k = 5
    max_subk = 2
    k = min(max_k, n - 1)
    subk = min(max_subk, k)

    weight_center = 0.5
    weight_neighbors = 0.3
    weight_remaining = 1.0 - weight_center - weight_neighbors

    # choose a random time series as the centre / DBA initialisation
    idx_center = random.randint(0, n - 1)
    init_dba = x_train[idx_center]

    if k > 0:
        topk_idx = np.array(
            get_neighbors(x_train, init_dba, k, dist_fun, dist_fun_params,
                          pre_computed_matrix=dist_pair_mat,
                          index_test_instance=idx_center))
        # Shuffle the candidates, then drop the centre before taking subk of
        # them: if the centre shows up among its own neighbours it must not
        # be selected, otherwise its weight would be overwritten below.
        shuffled = topk_idx[np.random.permutation(k)]
        selected = shuffled[shuffled != idx_center][:subk]
    else:
        # A single series has no neighbours to select.
        selected = np.array([], dtype=int)
    n_selected = len(selected)

    # Every series that is neither the centre nor a selected neighbour gets
    # an equal share of the remaining weight.
    n_others = n - 1 - n_selected
    if n_others > 0:
        fill_value = weight_remaining / n_others
    else:
        fill_value = 0.0

    weights = np.full((n, num_dim), fill_value, dtype=np.float64)
    weights[idx_center] = weight_center
    if n_selected > 0:
        weights[selected] = weight_neighbors / n_selected

    return weights, init_dba
def get_weights_average_selected(tseries, dist_pair_mat, distance_algorithm='DTW'):
    """
    Calculate weights with the average selected method.

    A randomly chosen series becomes the centre and receives weight 0.5.  Up
    to two series drawn at random from the centre's (at most five) nearest
    neighbours share a total weight of 0.3.  The remaining 0.2 is spread
    evenly over all other series.

    :param array tseries: the list of time series
    :param array dist_pair_mat: the pre-computed distance matrix, forwarded
        to ``get_neighbors``
    :param string distance_algorithm: the distance algorithm
        (NOTE(review): not read by this function -- confirm whether it should
        select ``dist_fun`` / ``dist_fun_params``)
    :return: the weight of each sequence (float64 vector of length ``n``) and
        the randomly chosen centre series, to be used as init for DBA
    """
    n = len(tseries)

    # k nearest neighbours are retrieved, subk of them are actually weighted;
    # both are capped by the number of series other than the centre.
    max_k = 5
    max_subk = 2
    k = min(max_k, n - 1)
    subk = min(max_subk, k)

    weight_center = 0.5
    weight_neighbors = 0.3
    weight_remaining = 1.0 - weight_center - weight_neighbors

    # choose a random time series as the centre / DBA initialisation
    idx_center = random.randint(0, n - 1)
    init_dba = tseries[idx_center]

    if k > 0:
        # NOTE(review): dist_fun and dist_fun_params are not defined in this
        # function; presumably they are module-level names -- verify.
        topk_idx = np.array(get_neighbors(tseries, init_dba, k, dist_fun,
                                          dist_fun_params,
                                          pre_computed_matrix=dist_pair_mat,
                                          index_test_instance=idx_center))
        # The randomly drawn neighbours must not contain the centre itself,
        # otherwise the centre's weight would be overwritten.  Shuffling once
        # and filtering the centre out always terminates, unlike re-drawing
        # until the centre happens to be absent.
        shuffled = topk_idx[np.random.permutation(k)]
        selected = shuffled[shuffled != idx_center][:subk]
    else:
        # A single series has no neighbours to select.
        selected = np.array([], dtype=int)
    n_selected = len(selected)

    # Every series that is neither the centre nor a selected neighbour gets
    # an equal share of the remaining weight.
    n_others = n - 1 - n_selected
    if n_others > 0:
        fill_value = weight_remaining / n_others
    else:
        fill_value = 0.0

    weights = np.full(n, fill_value, dtype=np.float64)
    weights[idx_center] = weight_center
    if n_selected > 0:
        weights[selected] = weight_neighbors / n_selected

    return weights, init_dba
def lvq3(prototypes, training_set, alpha=1, epsilon=.6):
    """Run one LVQ3 pass over ``training_set``, adjusting ``prototypes``.

    For each row the two nearest prototypes are adjusted.  When both carry
    the row's label (last element) the step is damped to ``alpha * epsilon``;
    otherwise the full ``alpha`` is used.

    :param prototypes: prototype vectors, label in the last position.
    :param training_set: iterable of rows, label in the last position.
    :param alpha: step passed to ``adjust``.
    :param epsilon: damping factor applied when both prototypes match the row.
    :return: the same ``prototypes`` object that was passed in.
    """
    for row in training_set:
        first, second = knn.get_neighbors(row, prototypes, 2)
        both_match = first[-1] == row[-1] and second[-1] == row[-1]
        step = alpha * epsilon if both_match else alpha
        for prototype in (first, second):
            adjust(prototype, row, step)
    return prototypes
def lvq2(prototypes, training_set, alpha=1, w=.6):
    """Run one LVQ2 pass over ``training_set``, adjusting ``prototypes``.

    The two nearest prototypes of a row are adjusted only when they carry
    different labels, at least one of them carries the row's label, and
    ``windowed`` accepts the row for window width ``w``.

    :param prototypes: prototype vectors, label in the last position.
    :param training_set: iterable of rows, label in the last position.
    :param alpha: step passed to ``adjust``.
    :param w: window parameter passed to ``windowed``.
    :return: the same ``prototypes`` object that was passed in.
    """
    for row in training_set:
        first, second = knn.get_neighbors(row, prototypes, 2)
        target = row[-1]
        # The checks short-circuit in this order, so ``windowed`` is only
        # consulted once the label conditions hold.
        if (first[-1] != second[-1]
                and not (first[-1] != target and second[-1] != target)
                and windowed(w, row, first, second)):
            adjust(first, row, alpha)
            adjust(second, row, alpha)
    return prototypes
def lvq1(training_set, prototype_num, alpha=1):
    """Create ``prototype_num`` prototypes and train them with one LVQ1 pass.

    Prototypes come from ``random_vector(training_set)`` and are labelled
    round-robin with the classes found in the training set, so every class
    gets at least one prototype.  Each training row then adjusts its single
    nearest prototype.

    :param training_set: sequence of rows, label in the last position.
    :param prototype_num: number of prototypes to create.
    :param alpha: step passed to ``adjust``.
    :return: list of trained prototypes.
    :raises ValueError: if ``prototype_num`` is smaller than the number of
        distinct classes.
    """
    labels = list({sample[-1] for sample in training_set})
    if prototype_num < len(labels):
        raise ValueError(
            'Number of prototypes ({}) must be equal or greater then number of classes ({})'
            .format(prototype_num, len(labels)))

    prototypes = [random_vector(training_set) for _ in range(prototype_num)]
    for position, prototype in enumerate(prototypes):
        prototype[-1] = labels[position % len(labels)]

    for row in training_set:
        nearest = knn.get_neighbors(row, prototypes, 1)[0]
        adjust(nearest, row, alpha)
    return prototypes
def perform_prediction(recipient_id, raw_input):
    """Predict a label for the point a user sent and reply with the result.

    :param recipient_id: id of the user; used to look up their training set
        in ``users`` and as the destination of the reply.
    :param raw_input: message text, expected in the form ``"x,y"`` with
        integer coordinates.
    :return: None.  The outcome is sent to the user through ``bot``: either
        the predicted label or a "Wrong input on prediction" notice.
    """
    # The text comes straight from the user, so both the shape (exactly two
    # comma-separated fields) and the integer conversion can fail.  Either
    # failure raises ValueError and is reported back instead of crashing.
    try:
        x_text, y_text = raw_input.split(',')
        to_predict = [int(x_text), int(y_text)]
    except ValueError:
        bot.send_text_message(recipient_id, "Wrong input on prediction")
        return

    # Get the user's training set
    train_to_fit = np.array(users[recipient_id].training_set)

    neighbors = get_neighbors(training_set=train_to_fit,
                              test_instance=to_predict,
                              k=k)
    majority_vote = get_majority_vote(neighbors)
    print('Predicted label=' + str(majority_vote))

    message = "Predicted label {}".format(str(majority_vote))
    bot.send_text_message(recipient_id, message)
# this is also a lopp over the names of the datasets for c in classes: # get the x_train without the test instances x_train = tot_x_train[np.where(tot_y_train != c)] # get the y_train without the test instances y_train = tot_y_train[np.where(tot_y_train != c)] # get the x_test instances x_test = tot_x_train[np.where(tot_y_train == c)] # init the distances distances = [] # loop through each test instances for x_test_instance in x_test: # get the nearest neighbors distance_neighbors = get_neighbors(x_train, x_test_instance, 0, dist_fun, dist_fun_params, return_distances=True) # concat the distances distances = distances + distance_neighbors # sort list by specifying the second item to be sorted on distances.sort(key=operator.itemgetter(1)) # to numpy array the second item only (the label) distances = np.array([y_train[distances[i][0]] \ for i in range(len(distances))]) # aggregate the closest datasets # this is useful if two datasets are in the k nearest neighbors # more than once because they have more than one similar class distances = pd.unique(distances) # leave only the k nearest ones for i in range(1, nb_neighbors + 1):
def test_get_neighbors(train, test_row, num_neighbors):
    """Print the neighbours ``knn.get_neighbors`` returns for ``test_row``.

    :param train: training data forwarded to ``knn.get_neighbors``.
    :param test_row: the row whose neighbours are requested.
    :param num_neighbors: number of neighbours to request.
    """
    found = knn.get_neighbors(train, test_row, num_neighbors)
    print('Neighbours:')
    for entry in found:
        print(entry)