def unsuper_simple_rasar_multiclass(X_train, X_test, y_train, y_test):
    df_simple_train = pd.DataFrame()
    df_simple_test = pd.DataFrame()
    for i in range(1, 6):
        # in order to train K-NN
        X_train_i = X_train[y_train == i].copy()

        ##########################
        ######## DF RASAR -- TRAIN
        ##########################
        knn_train = KNeighborsClassifier(n_jobs=-2, leaf_size=30, n_neighbors=2)
        knn_train.fit(X_train_i, y_train[y_train == i])
        neigh_i = knn_train.kneighbors(X_train, return_distance=True)
        idx_neigh_i, dist_neigh_i = right_neighbor(neigh_i, X_train, X_train_i)
        df_simple_train.loc[:, 'LC50_MOR_' + str(i)] = dist_neigh_i

        ##########################
        ######## DF RASAR -- TEST
        ##########################
        knn_test = KNeighborsClassifier(n_jobs=-2, leaf_size=30, n_neighbors=1)
        knn_test.fit(X_train_i, y_train[y_train == i])
        neigh_i = knn_test.kneighbors(X_test, return_distance=True)
        df_simple_test.loc[:, 'LC50_MOR_' + str(i)] = neigh_i[0].ravel()
    return df_simple_train, df_simple_test
class Search_Engine(object):
    def __init__(self, n_neighbors=1, thre=0.3, rej='top', weights='uniform',
                 algorithm='ball_tree', metric='euclidean'):
        self.engin = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,
                                          algorithm=algorithm, metric=metric)
        self.thre = thre
        if rej not in {'top', 'mean'}:
            raise ValueError("rej must be 'top' or 'mean'")
        self.rej = rej
        self.n_neighbors = n_neighbors

    def fit(self, feas, labels):
        if feas.ndim == 2:
            fea_norm = np.linalg.norm(feas, axis=1)[:, np.newaxis]
        else:
            raise Exception('Wrong dimension')
        feas = feas / fea_norm
        self.engin.fit(feas, labels)

    def predict(self, query):
        if query.ndim == 1:
            # normalize and reshape to the (1, n_features) form sklearn expects
            query = (query / np.linalg.norm(query)).reshape(1, -1)
        else:
            raise Exception('Wrong dimension')
        if self.rej == 'top':
            dis = self.engin.kneighbors(query, 1)[0][0][0]
        elif self.rej == 'mean':
            # integer division: kneighbors requires an int neighbor count
            dis = np.mean(self.engin.kneighbors(query, self.n_neighbors // 2 + 1)[0][0])
        if dis > self.thre:
            label = -1
        else:
            label = self.engin.predict(query)[0]
        return label, dis
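# Hedged usage sketch for Search_Engine above (not from the original source):
# gallery vectors are L2-normalized at fit time, and a query whose nearest
# gallery distance exceeds `thre` is rejected with label -1.
import numpy as np

gallery = np.array([[1., 0.], [0., 1.], [1., 1.]])
gallery_labels = np.array([0, 1, 2])
se = Search_Engine(n_neighbors=1, thre=0.3, rej='top')
se.fit(gallery, gallery_labels)
print(se.predict(np.array([0.9, 0.1])))   # close to class 0 -> (0, small dist)
print(se.predict(np.array([-1., -1.])))   # far from everything -> (-1, dist)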
def synthesize_hybrid(self, data, label):
    """
    Process a mixed (hybrid) cluster containing both classes.
    :param data: samples in the cluster
    :param label: labels of the samples (1. marks the minority class)
    :return: list of synthetic minority samples
    """
    major_data = []
    minor_data = []
    major_label = []
    minor_label = []
    synthetics = []
    # Split the cluster into majority and minority samples
    for data_, label_ in zip(data, label):
        if label_ == 1.:
            minor_data.append(data_)
            minor_label.append(label_)
        else:
            major_data.append(data_)
            major_label.append(label_)
    border_minor = []
    border_major = []
    knn_major = KNeighborsClassifier(n_neighbors=1)
    knn_minor = KNeighborsClassifier(n_neighbors=1)
    knn_minor.fit(X=minor_data, y=minor_label)
    knn_major.fit(X=major_data, y=major_label)
    # Find the borderline majority and minority samples
    for major in major_data:
        index = knn_minor.kneighbors(X=[major], n_neighbors=1, return_distance=False)[0][0]
        border_minor.append(minor_data[index])
    for minor in minor_data:
        index = knn_major.kneighbors(X=[minor], n_neighbors=1, return_distance=False)[0][0]
        border_major.append(major_data[index])
    border_minor = self.set(border_minor)
    border_major = self.set(border_major)
    n_neighbors_major = self.n
    n_neighbors_minor = self.n
    if n_neighbors_minor > len(minor_data):
        n_neighbors_minor = len(minor_data)
    if n_neighbors_major > len(major_data):
        n_neighbors_major = len(major_data)
    # Synthesize minority samples
    for minor in minor_data:
        if minor in border_minor:
            index = knn_major.kneighbors(X=[minor], n_neighbors=n_neighbors_major,
                                         return_distance=False)[0]
            length = len(index)
            for i in range(self.n):
                index_ = index[i % length]
                point_y = major_data[index_]
                synthetics.append(self.synthesize(point_x=minor, point_y=point_y))
        else:
            index = knn_minor.kneighbors(X=[minor], n_neighbors=n_neighbors_minor,
                                         return_distance=False)[0]
            index = index[1:]
            length = len(index)
            for i in range(self.n):
                index_ = index[i % length]
                point_y = minor_data[index_]
                synthetics.append(self.synthesize(point_x=minor, point_y=point_y))
    return synthetics
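# The self.synthesize call above is presumably SMOTE-style linear interpolation
# between a sample and one of its neighbors; a minimal stand-alone sketch of
# that assumption (the helper below is hypothetical, not the original method):
import numpy as np

def synthesize(point_x, point_y, rng=np.random.default_rng(0)):
    gap = rng.random()  # random point on the segment between the two samples
    return np.asarray(point_x) + gap * (np.asarray(point_y) - np.asarray(point_x))

print(synthesize([0., 0.], [1., 2.]))  # e.g. array([0.637..., 1.274...])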
def print_acc(emb_table):
    classifier = KNeighborsClassifier(n_neighbors=20)
    classifier.fit(emb_table, range(len(emb_table)))
    classifier_cos = KNeighborsClassifier(n_neighbors=20, metric='cosine')
    classifier_cos.fit(emb_table, range(len(emb_table)))
    artist_range = range(len(artist_related))
    id_sim = 0
    id_sim_cos = 0
    for _ in range(1000):
        a = random.choice(artist_range)
        emb_related = classifier.kneighbors(emb_table[a].reshape(1, -1), len(artist_related[a]))
        emb_related_cos = classifier_cos.kneighbors(emb_table[a].reshape(1, -1), len(artist_related[a]))
        spot_id = set(artist_related[a])
        emb_id = set(emb_related[1][0])
        emb_id_cos = set(emb_related_cos[1][0])
        id_sim += float(len(emb_id & spot_id)) / len(emb_id | spot_id) * 100
        id_sim_cos += float(len(emb_id_cos & spot_id)) / len(emb_id_cos | spot_id) * 100
    mse_loss = 0
    mse_mean = 0
    rand_mse_loss = 0
    rand_mse_mean = 0
    for a, r in artist_related.items():
        related_indices = list(r)
        random_indices = random.sample(artist_range, len(related_indices))
        repeated_artist_indices = [a] * len(related_indices)
        mse_vals = (np.square(emb_table[repeated_artist_indices] - emb_table[related_indices])).mean(axis=1)
        mse_loss += mse_vals.sum(axis=0)
        mse_mean += mse_vals.mean(axis=0)
        rand_mse_vals = (np.square(emb_table[repeated_artist_indices] - emb_table[random_indices])).mean(axis=1)
        rand_mse_loss += rand_mse_vals.sum(axis=0)
        rand_mse_mean += rand_mse_vals.mean(axis=0)
    print(f'ID Similarity MSE   : {id_sim / 1000:.20f}')
    print(f'ID Similarity COS   : {id_sim_cos / 1000:.20f}')
    print(f'Related Total MSE   : {mse_loss:.20f}')
    print(f'Random Total MSE    : {rand_mse_loss:.20f}')
    print(f'Total MSE Ratio     : {rand_mse_loss / mse_loss:.20f}')
    print(f'Related Average MSE : {mse_mean / len(artist_related):.20f}')
    print(f'Random Average MSE  : {rand_mse_mean / len(artist_related):.20f}')
    print(f'Average MSE Ratio   : {(rand_mse_mean / len(artist_related)) / (mse_mean / len(artist_related)):.20f}')
    with open(model_name + '_temp.emb.pickle', 'wb') as f:
        save = {'embedding_lookup': emb_table}
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
        del save
def generate_nuclei_centers(self, n_pix_per_nuc=9, dtype='int32'):
    '''
    Generates the nuclei randomly within a cell
    --------------------
    parameters:
        n_pix_per_nuc: number of pixels in a nucleus (default: 9)
        dtype: data type for the nuclei tensor (default: int32)
    --------------------
    adds:
        self.nuclei: 3d tensor of nuclei, -1 is no nucleus, number is
        according to the cell ID
    '''
    if self.cell_centers is None:
        self.generate_cell_centers()
    nuclei = np.zeros(self.true_map.shape, dtype=dtype)
    nuclei -= 1
    check_nuclei_surroundings = KNeighborsClassifier(n_neighbors=27)
    non_zero_cell_locs = np.where(self.true_map != -1)
    cell_ids = self.true_map.ravel()
    xs = non_zero_cell_locs[0]
    ys = non_zero_cell_locs[1]
    zs = non_zero_cell_locs[2]
    cell_ids = cell_ids[cell_ids != -1]
    check_nuclei_surroundings.fit(np.vstack((xs, ys, zs)).T, cell_ids)
    for i in np.unique(self.cell_ids):
        if i != -1:
            cell_coords = np.where(self.true_map == i)
            rand_index = randint(0, len(cell_coords[0]) - 1)
            xs = cell_coords[0]
            ys = cell_coords[1]
            zs = cell_coords[2]
            locs_as_mat = np.vstack((xs, ys, zs)).T
            clf_seed = KNeighborsClassifier(
                n_neighbors=min(n_pix_per_nuc, locs_as_mat.shape[0]))
            nuclei_seed = locs_as_mat[rand_index, :]
            clf_seed.fit(locs_as_mat, np.arange(locs_as_mat.shape[0]))
            nuc_pix_locs = clf_seed.kneighbors([nuclei_seed])[1][0]
            nuc_pix = locs_as_mat[nuc_pix_locs, :]
            non_same_celltype = cell_ids[
                check_nuclei_surroundings.kneighbors(nuc_pix)[1]]
            # remove nuc pixels that are on the border
            nuc_pix = nuc_pix[np.sum(non_same_celltype == i, axis=1) == 27, :]
            if nuc_pix.shape[0] > 0:
                nuclei[nuc_pix[:, 0], nuc_pix[:, 1], nuc_pix[:, 2]] = i
            else:
                nuclei[nuclei_seed[0], nuclei_seed[1], nuclei_seed[2]] = i
    # print(np.unique(nuclei))
    self.nuclei = nuclei
def unsuper_simple_rasar(train_distance_matrix, test_distance_matrix,
                         X_train, X_test, y_train, y_test):
    ######## starting DATAFRAME ##########
    X_train0 = X_train[y_train == 0].copy()
    X_train1 = X_train[y_train == 1].copy()

    # in order to train 1-NN
    dist_matr_train_0 = train_distance_matrix.iloc[y_train == 0, y_train == 0]
    dist_matr_train_1 = train_distance_matrix.iloc[y_train == 1, y_train == 1]

    # To find neighbors for train experiments --> df_rasar_train
    dist_matr_train_train_0 = train_distance_matrix.iloc[:, y_train == 0]
    dist_matr_train_train_1 = train_distance_matrix.iloc[:, y_train == 1]

    # To find neighbors for test experiments --> df_rasar_test
    dist_matr_test_train_0 = test_distance_matrix.iloc[:, y_train == 0]
    dist_matr_test_train_1 = test_distance_matrix.iloc[:, y_train == 1]

    ####### DF train RASAR ###############
    # finding the nearest 0s experiments for training experiments that is not itself
    knn0 = KNeighborsClassifier(metric='precomputed', n_jobs=-2, n_neighbors=2)
    knn0.fit(dist_matr_train_0, y_train[y_train == 0])
    neigh0 = knn0.kneighbors(dist_matr_train_train_0, return_distance=True)
    _, dist0 = right_neighbor(neigh0, X_train, X_train0)

    # finding the nearest 1s experiments for training experiments that is not itself
    knn1 = KNeighborsClassifier(metric='precomputed', n_jobs=-2, n_neighbors=2)
    knn1.fit(dist_matr_train_1, y_train[y_train == 1])
    neigh1 = knn1.kneighbors(dist_matr_train_train_1, return_distance=True)
    _, dist1 = right_neighbor(neigh1, X_train, X_train1)

    df_rasar_train = pd.DataFrame({'dist_neigh0': dist0, 'dist_neigh1': dist1})

    ####### DF test RASAR ################
    # finding the nearest 0s experiments to test data
    knn0 = KNeighborsClassifier(metric='precomputed', n_neighbors=1, n_jobs=-2)
    knn0.fit(dist_matr_train_0, y_train[y_train == 0])
    neigh0 = knn0.kneighbors(dist_matr_test_train_0, return_distance=True)
    # idx_neigh_0 = pd.DataFrame(neigh0[1])[0].apply(lambda x: X_train.iloc[y_train==0].iloc[x].name)

    # finding the nearest 1s experiments to test data
    knn1 = KNeighborsClassifier(metric='precomputed', n_neighbors=1, n_jobs=-2)
    knn1.fit(dist_matr_train_1, y_train[y_train == 1])
    neigh1 = knn1.kneighbors(dist_matr_test_train_1, return_distance=True)
    # idx_neigh_1 = pd.DataFrame(neigh1[1])[0].apply(lambda x: X_train.iloc[y_train==1].iloc[x].name)

    df_rasar_test = pd.DataFrame({
        'dist_neigh0': neigh0[0].ravel(),
        'dist_neigh1': neigh1[0].ravel()
    })
    return df_rasar_train, df_rasar_test
def get_summary(positive_sentences, negative_sentences, num_clusters=3):
    # Tokenize the sentences
    print("Tokenizing")
    pos_token = [tokenizer(i) for i in positive_sentences]
    neg_token = [tokenizer(i) for i in negative_sentences]

    # Preparing vocabulary
    print("Preparing vocabulary")
    stop = set(stopwords.words('english'))
    pos_vocab = set(pos_model.wv.vocab) - stop
    pos_vocab = [i for i in pos_vocab if can_be_adjective(i)]
    s1 = []
    pos_words = []  # keep words aligned with the rows of s1
    for sent in pos_token:
        for word in sent:
            if word in pos_vocab:
                s1.append(pos_model[word])
                pos_words.append(word)
    s1 = np.array(s1)
    neg_vocab = set(neg_model.wv.vocab) - stop
    neg_vocab = [i for i in neg_vocab if can_be_adjective(i)]
    s2 = []
    neg_words = []  # keep words aligned with the rows of s2
    for sent in neg_token:
        for word in sent:
            if word in neg_vocab:
                s2.append(neg_model[word])
                neg_words.append(word)
    s2 = np.array(s2)

    # Clustering
    print("Clustering")
    pos_kmeans = KMeans(n_clusters=num_clusters).fit(s1)
    pos_centers = pos_kmeans.cluster_centers_
    pos_neigh = KNeighborsClassifier(n_neighbors=1)
    pos_neigh.fit(s1, pos_kmeans.labels_)
    neg_kmeans = KMeans(n_clusters=num_clusters).fit(s2)
    neg_centers = neg_kmeans.cluster_centers_
    neg_neigh = KNeighborsClassifier(n_neighbors=1)
    neg_neigh.fit(s2, neg_kmeans.labels_)

    print("Most significant words")
    print("--Positives--")
    for i in pos_centers:
        # index into the word list aligned with s1, not the vocab list
        print(pos_words[pos_neigh.kneighbors(i.reshape(1, -1), return_distance=False)[0, 0]])
    print("\n\n")
    print("--Negatives--")
    for i in neg_centers:
        print(neg_words[neg_neigh.kneighbors(i.reshape(1, -1), return_distance=False)[0, 0]])
def testing(input):
    vect = CountVectorizer()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    X_ = vect.fit_transform(X)
    x_test = vect.transform(X_test)
    knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
    knn.fit(X_, y)
    y_pred = knn.predict(x_test)
    test = vect.transform([input])
    neighbor_index = knn.kneighbors(test, 1)[1][0][0]  # look up the neighbor once
    nearest_neighbor = df['QUES'][neighbor_index]
    ans = df['ANS'][neighbor_index]
    label = get_key(knn.predict(test)[0])
    accuracy = metrics.accuracy_score(y_test, y_pred)
    return nearest_neighbor, ans, label, accuracy
def process_grid_cell(train, test, grid_id, threshold, model, grid_variable):
    """
    Creates model and generates predictions for row_ids in a particular
    grid cell.
    """
    start = time.time()

    # Filter data onto single grid cell
    train_cell = train[train[grid_variable] == grid_id]
    test_cell = test[test[grid_variable] == grid_id]
    test_ids = test_cell.index

    # Remove place ids from train data with frequency below threshold
    place_counts = train_cell.place_id.value_counts()
    mask = place_counts[train_cell.place_id.values] >= threshold
    train_cell = train_cell.loc[mask.values]

    # Encode place id as labels
    le = LabelEncoder()
    y_train = le.fit_transform(train_cell.place_id.values)
    X_train = train_cell.drop(['place_id', grid_variable], axis=1).values
    X_test = test_cell.drop(['place_id', grid_variable], axis=1).values

    # NN as features
    model_nn = KNeighborsClassifier(n_neighbors=31, n_jobs=-1,
                                    weights='distance', metric='manhattan')
    model_nn.fit(X_train, y_train)
    train_neighbors = pd.DataFrame(model_nn.kneighbors(X_train, n_neighbors=31, return_distance=True)[0])
    test_neighbors = pd.DataFrame(model_nn.kneighbors(X_test, n_neighbors=31, return_distance=True)[0])
    train_nn_cols = train_neighbors.columns
    test_nn_cols = test_neighbors.columns
    # assign the neighbor-distance columns once, by value
    train_cell[train_nn_cols] = train_neighbors.values
    test_cell[test_nn_cols] = test_neighbors.values
    X_train = train_cell.drop(['place_id', grid_variable], axis=1).values
    X_test = test_cell.drop(['place_id', grid_variable], axis=1).values

    # Build training classifier and predict
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3]).astype(str)
    end = time.time()
    time_elapsed = end - start

    # Generate CV score
    map3 = MAP3(test_cell['place_id'], pred_labels)

    # Return data
    return pred_labels, test_ids, time_elapsed, map3
def para_func2(arg):
    num, shape, shape2, metric, cnum = arg
    X = _sharedX
    X2 = _sharedX2
    centers = choice(X.shape[0], cnum, False)
    mod = KClass(1, metric=metric)
    mod.fit(X[centers, :], range(centers.size))
    dista1, ma1 = mod.kneighbors(X, return_distance=True)
    distb1, mb1 = mod.kneighbors(X2, return_distance=True)
    mall = ma1
    mall2 = mb1
    return mall2, mall
def neighbors(data, pred_c, new_user):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(data, pred_c)
    n_knn = knn.predict(new_user)
    dist, idx = knn.kneighbors(new_user, n_neighbors=5)
    print("Users closest to the new user:", idx)
    return idx
class Identifier:
    def __init__(self):
        self.model = KNeighborsClassifier(n_neighbors=1, n_jobs=4)
        self.threshold = 0.7
        self.data = {}
        self.data_classes = []

    def identify(self, face):
        if face.embedding is not None:
            nearest_neighbour = self.model.kneighbors([face.embedding])
            distance = nearest_neighbour[0][0][0]
            print('nearest_neighbour: {}'.format(nearest_neighbour))
            if distance >= self.threshold:
                return 'Unknown'
            return self.data_classes[nearest_neighbour[1][0][0]]

    def add_identities(self, faces):
        for face in faces:
            if face.name not in self.data:
                self.data[face.name] = []
            self.data[face.name].append(face.embedding)
        data = []
        classes = []
        for name, deep_features in self.data.items():
            for deep_feature in deep_features:
                data.append(deep_feature)
                classes.append(name)
        self.data_classes = classes
        self.model.fit(data, classes)
        return face
def get_recommendations(X, y, n_neighs=100):
    neigh = KNeighborsClassifier(n_neighbors=n_neighs, metric='cosine')
    neigh.fit(X, y)
    distances, neigh_ids = neigh.kneighbors(X, n_neighs)
    titles_recommendations = {}
    for i, title in enumerate(y):
        titles_scores = [distances[i]]
        titles_neighs = [neigh_ids[i]]
        normalized_scores = 1 - normalize(titles_scores)
        # #normalized_scores = normalize(titles_scores)
        grouped_title_scores = {}
        for idx, titles_id in enumerate(titles_neighs):
            neighs_title_name = y[titles_id]  # names of this title's neighbors
            neighs_title_distance = normalized_scores[idx]
            for neigh_title, neigh_distance in zip(neighs_title_name, neighs_title_distance):
                grouped_title_scores.setdefault(neigh_title, []).append(neigh_distance)
        titles_recommendations[title] = sorted(
            [(tit, dist[0]) for tit, dist in grouped_title_scores.items()],
            key=lambda x: x[1],
            reverse=True
        )
    return titles_recommendations
def get_types(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X, y)
    types = []
    neighbors = knn.kneighbors(X, n_neighbors=6, return_distance=False)
    actual_neighbors = neighbors[:, 1:]  # each point counts itself as its own neighbor
    for i in range(len(actual_neighbors)):
        instance_neighbors_of_the_same_class = 0
        for neighbor in actual_neighbors[i]:
            if y[i] == y[neighbor]:
                instance_neighbors_of_the_same_class += 1
        if instance_neighbors_of_the_same_class >= 4:
            types.append(Types.SAFE)
        elif instance_neighbors_of_the_same_class >= 2:
            types.append(Types.BORDERLINE)
        elif instance_neighbors_of_the_same_class >= 1:
            types.append(Types.RARE)
        else:
            types.append(Types.OUTLIER)
    return np.array(types)
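# Self-contained sketch of the same safe/borderline/rare/outlier counting on
# toy blobs. The thresholds mirror the method above; the dataset is
# illustrative, not from the original project.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
X_toy = np.vstack([rng.normal(0, 1, (20, 2)), rng.normal(4, 1, (20, 2))])
y_toy = np.array([0] * 20 + [1] * 20)
knn_toy = KNeighborsClassifier(n_neighbors=5).fit(X_toy, y_toy)
# ask for 6 neighbors and drop column 0: each training point is returned as
# its own nearest neighbor, so the remaining 5 columns are the real neighbors
nbrs = knn_toy.kneighbors(X_toy, n_neighbors=6, return_distance=False)[:, 1:]
same = (y_toy[nbrs] == y_toy[:, None]).sum(axis=1)
print(np.bincount(same, minlength=6))  # how many points have 0..5 same-class neighbors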
class KNN():
    def __init__(self, knn, stage):
        self.knn = knn
        _, self.kernel_pca = pre_process(stage)
        sample_pos, sample_neg, X_c1, self.Y_c1 = read_label_data(stage)
        X_reduced = self.kernel_pca.transform(X_c1)
        self.knn1 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
        self.y = []
        self.knn1.fit(X_reduced, self.Y_c1)
        self.knn2 = KNeighborsClassifier(n_neighbors=knn, weights='distance')
        self.knn2.fit(X_reduced, self.Y_c1)

    def predict(self, sample):
        prop = []
        X_reduced_valid = self.kernel_pca.transform(sample)
        for j in range(X_reduced_valid.shape[0]):
            kNeighbour = self.knn2.kneighbors([X_reduced_valid[j]], n_neighbors=self.knn)[1]
            if np.min(self.Y_c1[kNeighbour]) <= 0:
                self.y.append(0)
            else:
                self.y.append(1)
            prop.append(np.mean(self.Y_c1[kNeighbour]))
        y = np.array(self.y)
        self.y = []
        return y, np.array(prop)
def hybrid_precessor(self, data, label):
    major = []
    major_label = []
    minor = []
    minor_label = []
    border_major = []
    border_major_label = []
    for data_, label_ in zip(data, label):
        if label_ == 1.0:  # compare the per-sample label, not the whole sequence
            minor.append(data_)
            minor_label.append(label_)
        else:
            major.append(data_)
            major_label.append(label_)
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X=major, y=major_label)
    for m in minor:
        neighbor = knn.kneighbors(X=[m], n_neighbors=1, return_distance=False)[0][0]
        border_major.append(major[neighbor])
        border_major_label.append(0.0)
    return_data = []
    return_label = []
    for d in major:
        if d not in border_major:
            return_data.append(d)
            return_label.append(0.0)
    return return_data, return_label, border_major, border_major_label
class KNORAU(BaseEstimator, ClassifierMixin):
    """
    Implementation of the KNORA-Union dynamic ensemble selection (DES) method.
    """

    def __init__(self, ensemble=[], k=7, metric="euclidean"):
        self.ensemble = ensemble
        self.k = k
        self.metric = metric

    def fit(self, X, y):
        self.X_dsel = X
        self.y_dsel = y
        # use the configured metric rather than a hard-coded one
        self.knn = KNeighborsClassifier(n_neighbors=self.k, metric=self.metric)
        self.knn.fit(self.X_dsel, self.y_dsel)

    def estimate_competence(self, X):
        self.competences = np.zeros((X.shape[0], len(self.ensemble)))
        _, self.neighbors = self.knn.kneighbors(X=X, n_neighbors=self.k)
        local_X = np.reshape(self.X_dsel[self.neighbors], (-1, X.shape[-1]))
        local_y = np.reshape(self.y_dsel[self.neighbors], (-1))
        self.competences = np.sum(
            np.array([
                np.reshape(
                    clf.predict(local_X) == local_y, (X.shape[0], self.k))
                for clf in self.ensemble
            ]),
            axis=2,
        ).T
        # print(self.competences)

    def ensemble_matrix(self, X):
        """EM."""
        return np.array(
            [member_clf.predict(X) for member_clf in self.ensemble]).T

    def predict(self, X):
        if self.X_dsel.shape[0] >= self.k:
            self.estimate_competence(X)
            em = self.ensemble_matrix(X)
            predict = []
            for i, row in enumerate(em):
                decision = np.bincount(row, weights=self.competences[i])
                predict.append(np.argmax(decision))
        else:
            em = self.ensemble_matrix(X)
            predict = []
            for i, row in enumerate(em):
                decision = np.bincount(row)
                predict.append(np.argmax(decision))
        return np.array(predict)

    def score(self, X, y):
        return balanced_accuracy_score(y, self.predict(X))
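# Hedged usage sketch for the KNORAU class above: a pool of shallow trees is
# trained on one split and a second split serves as the DSEL. The data, pool
# size and depth are illustrative, and numpy / balanced_accuracy_score are
# assumed to be imported as the class already requires.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_all, y_all = make_classification(n_samples=600, random_state=0)
X_train, X_rest, y_train, y_rest = train_test_split(X_all, y_all, random_state=0)
X_dsel, X_test, y_dsel, y_test = train_test_split(X_rest, y_rest, random_state=0)
pool = [DecisionTreeClassifier(max_depth=3, random_state=i).fit(X_train, y_train)
        for i in range(10)]
knorau = KNORAU(ensemble=pool, k=7)
knorau.fit(X_dsel, y_dsel)
print(knorau.score(X_test, y_test))  # balanced accuracy of the selected ensemble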
def KnnPrediction(df_Movies, df_movie_id):
    movie_id = df_movie_id.iloc[0]
    cluster = df_Movies[df_Movies["tconst"] == movie_id]["cluster"].iloc[0]
    df_inter = df_Movies.loc[df_Movies['cluster'] == cluster]
    if df_inter.shape[0] < 6:
        df_Cluster = df_Movies
    else:
        columns = ['isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes']
        X = df_inter[columns]
        y = df_inter['tconst']
        model_KNN = KNeighborsClassifier(n_neighbors=5)
        model_KNN.fit(X, y)
        MovieTemp = model_KNN.kneighbors(
            df_inter.loc[df_inter['tconst'] == movie_id, columns], n_neighbors=6)
        clusterList = []
        for i in range(1, 6):
            clusterList.append(df_inter.iloc[MovieTemp[1][0][i]]['tconst'])
        df_Cluster = df_Movies[df_Movies["tconst"].isin(clusterList)]
    return df_Cluster
def classify(data):
    X, y = generate_X_and_Y()
    X_data = strip_song_and_artist(X)
    X_scaled = scale(X_data)
    knn = KNeighborsClassifier(n_neighbors=20)
    knn.fit(X_data, y)
    data_scaled = scale(data)
    print("Predicted Mood:", knn.predict(data))
    print("accuracy:", knn.score(X_data, y))
    distances, indices = knn.kneighbors(data, n_neighbors=20)

    # Moods + songs & artists of K nearest neighbors
    moods = [y[index] for index in indices[0]]
    songs_and_artists = [X[index][0:2] for index in indices[0]]
    sa_classes = [y[index] for index in indices[0]]
    # print("moods:")
    # pprint(moods)
    # print("indices:")
    # pprint(indices)
    for i in range(0, len(songs_and_artists)):
        print(songs_and_artists[i])
        print(sa_classes[i])
    # print("songs and artists:")
    # pprint(songs_and_artists)
    # print("their moods:")
    # pprint(sa_classes)
    return moods
def ex713():
    L = 40
    # Cross-validation not necessary. Instead, compute matrix of nearest neighbor
    # distances between each pair of data points ..
    knclassifier = KNeighborsClassifier(n_neighbors=L + 1).fit(X, ravel(y))
    neighbors = knclassifier.kneighbors(X)

    # .. and extract matrix where each row contains class labels of subsequent
    # neighbours (sorted by distance)
    ndist, nid = neighbors[0], neighbors[1]
    print(len(ndist))
    print(len(nid))
    print("=" * 20)
    nclass = y[nid].flatten().reshape(N, L + 1)

    # Use the above matrix to compute the class labels of majority of neighbors
    # (for each number of neighbors l), and estimate the test errors.
    errors = np.zeros(L)
    nclass_count = np.zeros((N, C))
    for l in range(1, L + 1):
        for c in range(C):
            nclass_count[:, c] = sum(nclass[:, 1:l + 1] == c, 1).A.ravel()
        y_est = np.argmax(nclass_count, 1)
        errors[l - 1] = (y_est != y.A.ravel()).sum()

    # Plot the classification error rate
    figure(1)
    plot(100 * errors / N)
    xlabel('Number of neighbors')
    ylabel('Classification error rate (%)')

    figure(2)
    imshow(nclass, cmap='binary', interpolation='None')
    xlabel("k'th neighbor")
    ylabel('data point')
    title("Neighbors class matrix")

    show()
def tuning_eps(data):
    zeros = [0] * 563
    neigh = KNeighborsClassifier(n_neighbors=5).fit(X=data, y=zeros)
    distances, indices = neigh.kneighbors(data)
    fig, ax = plt.subplots(figsize=(8, 6))
    sort = np.sort(distances, axis=0)
    ax.plot(np.linspace(0, N_IMAGES, num=N_IMAGES), sort[:, 4])
class KNN_model():
    def __init__(self):
        self.datasets = datasets()
        self.movie_name = self.datasets.get_movie_name()
        self.movie_dict = self.datasets.get_movie_dict()
        self.movie_score = self.datasets.get_movie_score()
        self.OneHot = self.datasets.get_OneHot()
        self.knn = KNeighborsClassifier(n_neighbors=3)
        self.knn.fit(self.OneHot, range(0, len(self.OneHot)))

    def knn_predict(self, movie, neighbors=10, out_number=5):
        self.location_id = self.movie_dict[movie]
        self.movie_id = self.OneHot[self.location_id]
        self.neighbors = self.knn.kneighbors(
            np.reshape(self.movie_id, [-1, self.OneHot.shape[-1]]), neighbors, False)
        self.get_neighbor_name = np.reshape(self.movie_name[self.neighbors], [-1, neighbors])
        self.get_neighbor_score = np.reshape(self.movie_score[self.neighbors], [-1, neighbors])
        self.id = np.argsort(self.get_neighbor_score)
        self.result = []
        for i in range(0, len(self.get_neighbor_name)):
            rst = self.get_neighbor_name[i][self.id[i]].tolist()
            rst.remove(movie)
            self.result.append(rst[0:out_number])
        return self.result

# iris_y_predict = knn.predict(iris_x_test)
# print(iris_y_predict)
def show_neighbors(item_names, item_features, i=None):
    # item_names = np.array of n_samples names
    # item_features = np.array of n_samples x D encoded features
    # i = item to show neighbors for
    if i is None:
        i = np.random.choice(range(len(item_names)))
    neigh = KNeighborsClassifier(n_neighbors=20)
    neigh.fit(item_features, item_names)
    # i = 10822
    nei_dists, nei_ids = neigh.kneighbors(item_features[[i], :])  # query once
    nei_items = item_names[nei_ids]
    print(item_names[i])
    print(nei_items)
    print(nei_dists)
def recommend_by_userid(user_id: int):
    # load user and movie embeddings
    user_embeddings = np.load(USER_EMBED_PATH)
    movie_embeddings = np.load(MOVIE_EMBED_PATH)
    # load user, movie and user_movie mappings
    uid_lbl_mapping = pickle.load(open(USER_LABEL_MAPPING, "rb"))
    mid_lbl_mapping = pickle.load(open(MOVIE_LABEL_MAPPING, "rb"))
    lbl_mid_mapping = pickle.load(open(LABEL_MOVIE_MAPPING, "rb"))
    user_movie_mapping = pickle.load(open(USER_MOVIE_MAPPING, "rb"))
    id_title_mapping = pickle.load(open(ID_TITLE_MAPPING, "rb"))

    user_label = uid_lbl_mapping.get(user_id)
    user_embedding = user_embeddings[user_label]
    user_watched_movies = user_movie_mapping[user_id]
    movies = list(mid_lbl_mapping.keys())
    user_unwatched_movies = list(set(movies) - set(user_watched_movies))
    user_unwatched_movies_labels = [mid_lbl_mapping[mid] for mid in user_unwatched_movies]

    clf = KNeighborsClassifier(n_neighbors=11)
    unwatched_movie_embeddings = movie_embeddings[user_unwatched_movies_labels]
    clf.fit(unwatched_movie_embeddings, user_unwatched_movies_labels)
    distances, indices = clf.kneighbors(user_embedding.reshape(1, -1), n_neighbors=10)
    distances, indices = zip(*sorted(zip(distances[0], indices[0])))
    distances, indices = list(distances), list(indices)
    sorted_movie_ids = [lbl_mid_mapping[m_idx] for m_idx in indices if m_idx != 0]
    recommend_movies = [id_title_mapping[mid] for mid in sorted_movie_ids]
    print("Recommended movies:", recommend_movies)
    return recommend_movies
class PriorNetwork(nn.Module):
    def __init__(self, size_training_set, code_length, n_hidden=512, k=5,
                 random_seed=4543):
        super(PriorNetwork, self).__init__()
        self.rdn = np.random.RandomState(random_seed)
        self.k = k
        self.size_training_set = size_training_set
        self.code_length = code_length
        self.fc1 = nn.Linear(self.code_length, n_hidden)
        self.fc2_u = nn.Linear(n_hidden, self.code_length)
        self.fc2_s = nn.Linear(n_hidden, self.code_length)
        self.knn = KNeighborsClassifier(n_neighbors=self.k, n_jobs=-1)
        # codes are initialized randomly - Alg 1: initialize C: c(x)~N(0,1)
        codes = self.rdn.standard_normal(
            (self.size_training_set, self.code_length))
        self.fit_knn(codes)

    def fit_knn(self, codes):
        ''' will reset the knn given an nd array '''
        st = time.time()
        self.codes = codes
        assert (len(self.codes) > 1)
        y = np.zeros((len(self.codes)))
        self.knn.fit(self.codes, y)

    def batch_pick_close_neighbor(self, codes):
        ''' :code latent activation of training example as np '''
        neighbor_distances, neighbor_indexes = self.knn.kneighbors(
            codes, n_neighbors=self.k, return_distance=True)
        bsize = neighbor_indexes.shape[0]
        if self.training:
            # randomly choose neighbor index from top k
            chosen_neighbor_index = self.rdn.randint(0, neighbor_indexes.shape[1],
                                                     size=bsize)
        else:
            # np.int was removed from numpy; use a concrete integer dtype
            chosen_neighbor_index = np.zeros((bsize), dtype=np.int64)
        return self.codes[neighbor_indexes[np.arange(bsize), chosen_neighbor_index]]

    def forward(self, codes):
        st = time.time()
        np_codes = codes.cpu().detach().numpy()
        previous_codes = self.batch_pick_close_neighbor(np_codes)
        previous_codes = torch.FloatTensor(previous_codes).to(DEVICE)
        return self.encode(previous_codes)

    def encode(self, prev_code):
        h1 = F.relu(self.fc1(prev_code))
        mu = self.fc2_u(h1)
        logstd = self.fc2_s(h1)
        return mu, logstd
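# Hedged usage sketch for PriorNetwork: fit on its own random initial codes,
# then query the conditional prior for a batch. DEVICE is assumed to be the
# torch device used elsewhere in this file; shapes are illustrative.
import torch

prior = PriorNetwork(size_training_set=100, code_length=8, k=5).to(DEVICE)
batch_codes = torch.randn(4, 8).to(DEVICE)
mu, logstd = prior(batch_codes)
print(mu.shape, logstd.shape)  # torch.Size([4, 8]) twice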
def main():
    list_data = training()
    dict_vect, entity_names = entities(list_data)
    vec = DictVectorizer()
    transformer = TfidfTransformer()
    vectors = vec.fit_transform(dict_vect).toarray()
    # tfidf_vectors = transformer.fit_transform(vectors).toarray()
    clf = GaussianNB().fit(vectors, entity_names)
    ne = KNeighborsClassifier(n_neighbors=5).fit(vectors, entity_names)
    # clf = MultinomialNB().fit(tfidf_vectors, entity_names)
    # clf = BernoulliNB().fit(tfidf_vectors, entity_names)
    allfiles = glob.glob('test/*.txt')
    for each in allfiles:
        with open(each, 'r') as f:
            data = f.read()
        dict_vectors, entity_n = test(data)
        vectors_pred = vec.transform(dict_vectors).toarray()
        # tfidf_vectors_pred = transformer.transform(vectors_pred).toarray()
        pred = clf.predict(vectors_pred)
        from sklearn.metrics import accuracy_score
        # print(accuracy_score(entity_names, clf.predict(vectors)))
        predict = ne.kneighbors(vectors_pred, n_neighbors=5, return_distance=False)
        print("\n \nPredictions for the given file: \t" + each)
        print("\n GaussianNB algorithm prediction: \t")
        for x, y in zip(entity_n, pred):
            print("\ninput word in the given file: \t" + x,
                  "\n predicted word for the input: \t" + y)
        print("\n5 Nearest neighbors using k-nearest neighbors algorithm prediction: \n")
        for neighbor_row, z in zip(predict, entity_n):  # avoid shadowing the filename loop variable
            print("The input word given in the file:\t" + z + "\n")
            for each1 in neighbor_row:
                print(entity_names[each1])
            print("\n")
class Recommender:
    def __init__(self, data, num_neighbors=20, metric='cosine'):
        print('Fit')
        self.data = data
        self.kn = KNeighborsClassifier(n_neighbors=num_neighbors, weights='distance',
                                       algorithm='brute', metric=metric, p=2)
        matrix = data.to_sparse_matrix()
        confs_name = ['' for i in range(len(data.conferences_set))]
        for conf in data.conferences_set:
            confs_name[data.conf_to_num[conf]] = conf
        self.kn.fit(matrix, confs_name)

    def recommend(self, user_conferences, bound=0.4, num_neighbors=10):
        print('Predict')
        rec_conf_with_dist = {i: 0 for i in self.data.conferences_set}
        rec_conf_number = {i: 0 for i in self.data.conferences_set}
        for c in user_conferences:
            number_of_current_user_conf = self.data.conf_to_num[c]
            dst1, ind1 = self.kn.kneighbors(self.data.conf_to_vector(c),
                                            return_distance=True,
                                            n_neighbors=num_neighbors)
            conf_with_dist = zip(normalize(dst1[0])[0], ind1[0])
            for dist, conf in conf_with_dist:
                rec_conf_with_dist[self.data.num_to_conf[conf]] += dist
                rec_conf_number[self.data.num_to_conf[conf]] += 1
        for conf, number in rec_conf_number.items():
            if number != 0 and number != 1:
                rec_conf_with_dist[conf] /= number
        result = sorted(rec_conf_with_dist.items(), key=operator.itemgetter(1))
        result = list(filter(lambda tmp: tmp[1] != 0 and tmp[1] < bound, result))
        return result
def df_datafusion_rasar(db_datafusion, db):
    grouped_datafusion = db_datafusion.groupby(by=['endpoint', 'effect', 'target'])
    db_datafusion_rasar = pd.DataFrame()
    for group in grouped_datafusion.groups:
        name = group[0] + '_' + group[1] + '_' + str(group[2])
        train_X = grouped_datafusion.get_group(group).drop(
            columns=['endpoint', 'effect', 'target'])
        test_X = db.copy()
        train_y = grouped_datafusion.get_group(group)['target'].values
        knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=1)
        knn.fit(train_X, train_y)
        neigh = knn.kneighbors(test_X, return_distance=True)
        db_datafusion_rasar[name] = neigh[0].ravel()
    return db_datafusion_rasar
def test(example, population, k):
    """
    Compare your results against the sklearn KNN classifier.
    This function should create a sklearn KNeighborsClassifier and verify that
    get_neighbors method returns the same result as the KNeighborsClassifier.

    >>> example = np.array([[1, 1]])
    >>> population = np.array([
    ...     [0, 0, 0],      # point at coordinate (0, 0) belongs to class 0
    ...     [100, 100, 1]   # point at coordinate (100, 100) belongs to class 1
    ... ])
    >>> # Provided get_neighbors implemented correctly
    >>> test(example, population, k=1)
    True
    """
    # YOUR CODE HERE #############################
    knn = KNeighborsClassifier()
    knn.fit(np.array([x[:2] for x in population]),
            np.array([x[2] for x in population]))
    neigh = knn.kneighbors(example, k, return_distance=False)
    res = sorted([x for i, x in enumerate(population) if i in neigh],
                 key=lambda x: x[0])
    res2 = sorted(get_neighbors(example, population, k), key=lambda x: x[0])
    for x, y in zip(res, res2):
        if all(x == y):
            pass
        else:
            return False
    return True
def frienemy_pruning(X_query, X_dsel, y_dsel, ensemble, k):
    """Implements the Online Pruning method (frienemy) which prunes base
    classifiers that do not cross the region of competence of a given instance.
    A classifier crosses the region of competence if it correctly classifies at
    least one sample for each different class in the region.

    Parameters
    ----------
    X_query : array-like of shape (n_samples, n_features)
        Test set.
    X_dsel : array-like of shape (n_samples, n_features)
        Dynamic selection set.
    y_dsel : array-like of shape (n_samples,)
        The target values (Dynamic selection set).
    ensemble : list of shape = [n_classifiers]
        The ensemble of classifiers to be pruned.
    k : int
        Number of neighbors used to compute the regions of competence.

    Returns
    -------
    DFP_mask : array-like of shape = [n_samples, n_classifiers]
        Mask containing 1 for the selected base classifier and 0 otherwise.
    """
    predictions = np.zeros((X_dsel.shape[0], len(ensemble)), dtype=np.intp)
    for index, clf in enumerate(ensemble):
        predictions[:, index] = clf.predict(X_dsel)
    hit_miss = predictions == y_dsel[:, np.newaxis]
    competence_region = KNeighborsClassifier(n_neighbors=k).fit(X_dsel, y_dsel)
    neighbors = competence_region.kneighbors(X_query, return_distance=False)
    return frienemy_pruning_preprocessed(neighbors, y_dsel, hit_miss)
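# Minimal sketch of the region-of-competence lookup at the heart of the
# function above; frienemy_pruning_preprocessed (not shown here) consumes the
# neighbor indices together with the hit/miss matrix. Data is illustrative.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X_dsel = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y_dsel = np.array([0, 0, 1, 1])
X_query = np.array([[0.2, 0.1], [0.9, 0.8]])
region = KNeighborsClassifier(n_neighbors=3).fit(X_dsel, y_dsel)
neighbors = region.kneighbors(X_query, return_distance=False)
print(neighbors)          # indices of each query's region of competence
print(y_dsel[neighbors])  # class make-up of each region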
def neighborhood_hit(X, y, k):
    # X is data, y labels and k number of neighbors
    logging.info("Computing neighborhood hit")
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)
    # NB: the queries are the training points themselves, so each point's
    # first "neighbor" is itself and counts as an automatic hit.
    neighbors = knn.kneighbors(X, return_distance=False)
    return np.mean(
        np.mean((y[neighbors] == np.tile(y.reshape((-1, 1)), k)).astype('uint8'),
                axis=1))
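# Quick sanity check for neighborhood_hit (the data here is illustrative):
# well-separated blobs should score close to 1.0.
import numpy as np
from sklearn.datasets import make_blobs

X_blobs, y_blobs = make_blobs(n_samples=100, centers=2, cluster_std=0.5,
                              random_state=0)
print(neighborhood_hit(X_blobs, y_blobs, k=5))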
def NSC_k_NN(df_treatment, embeds_cols, plot_conf=False, savepath=None):
    # Create classes for each moa
    class_dict = dict(zip(df_treatment['moa'].unique(),
                          np.arange(len(df_treatment['moa'].unique()))))
    df_treatment['moa_class'] = df_treatment['moa'].map(class_dict)

    # Create nearest neighbors classifier
    predictions = list()
    labels = list()
    label_names = list()
    for comp in df_treatment['compound'].unique():
        df_ = df_treatment.loc[df_treatment['compound'] != comp, :]
        knn = KNeighborsClassifier(n_neighbors=4, algorithm='brute', metric='cosine')
        knn.fit(df_.loc[:, embeds_cols], df_.loc[:, 'moa_class'])
        nn = knn.kneighbors(df_treatment.loc[df_treatment['compound'] == comp, embeds_cols])
        for p in range(nn[1].shape[0]):
            predictions.append(list(df_.iloc[nn[1][p]]['moa_class']))
        labels.extend(df_treatment.loc[df_treatment['compound'] == comp, 'moa_class'])
        label_names.extend(df_treatment.loc[df_treatment['compound'] == comp, 'moa'])
    predictions = np.asarray(predictions)
    k_nn_acc = [accuracy_score(labels, predictions[:, 0]),
                accuracy_score(labels, predictions[:, 1]),
                accuracy_score(labels, predictions[:, 2]),
                accuracy_score(labels, predictions[:, 3])]
    if plot_conf:
        print('There are {} treatments'.format(len(df_treatment)))
        print('NSC is: {:.2f}%'.format(accuracy_score(labels, predictions[:, 0]) * 100))
        plot_confusion_matrix(labels, predictions[:, 0], class_dict, 'NSC', savepath)
    return k_nn_acc
def main():
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap='cool')  # c=y colors points by class; cmap sets the color map
    plt.scatter(c[:, 0], c[:, 1], c='orange', s=50, marker='^')

    # Train the model
    k = 5
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X, y)

    # Predict with the model
    X_sample = np.array([[0, 1], [-2, 4], [3, 1]])  # newer sklearn requires a 2-D array, so pass an np array of samples
    y_sample = clf.predict(X_sample)
    print(y_sample)
    # print(X_sample[:, 1])
    neighbors = clf.kneighbors(X_sample, return_distance=False)
    # print(neighbors)
    plt.scatter(X_sample[:, 0], X_sample[:, 1], c=y_sample, marker='x', cmap='cool')
    # Note: if the predictions cover only two classes, the colors can come out wrong
    for index, element in enumerate(neighbors):  # enumerate gives the index of each row
        for i in element:
            # print(index)
            plt.plot([X[i][0], X_sample[index][0]],
                     [X[i][1], X_sample[index][1]],
                     'g--', linewidth=0.6)  # line from each prediction point to its k nearest samples
    # plt.plot([1,2,3], [1,2,3], 'go-', label='line 1', linewidth=2)
    plt.show()
def calculateDiscriminant(prototypes, classes, values):
    d = []
    c1 = classes[0]
    c2 = classes[1]
    bool_c1 = False
    bool_c2 = False
    knn = KNeighborsClassifier(n_neighbors=len(classes))
    knn.fit(prototypes, classes)
    for i in values:
        dist, kn_index = knn.kneighbors(X=[i], return_distance=True)
        dist = dist[0]
        kn_index = kn_index[0]
        for j in range(len(kn_index)):
            if (classes[kn_index[j]] == c1) and not bool_c1:
                p_c1 = kn_index[j]
                dist_c1 = dist[j]
                bool_c1 = True
            elif (classes[kn_index[j]] == c2) and not bool_c2:
                p_c2 = kn_index[j]
                dist_c2 = dist[j]
                bool_c2 = True
        d.append(dist_c1 - dist_c2)
        bool_c1 = False
        bool_c2 = False
    return d
def para_func(arg):
    num, shape, metric, cnum = arg
    X = _sharedX
    centers = choice(X.shape[0], cnum, False)
    mod = KClass(1, metric=metric)
    mod.fit(X[centers, :], range(centers.size))
    dist, m = mod.kneighbors(X, return_distance=True)
    return m
def kNNEngine(k, train, test):
    # create classifier
    clf = KNeighborsClassifier(k)
    clf.fit(train, [0] * len(train))  # 2nd argument is an arbitrary label vector
    # get nearest neighbors of my test point
    x = clf.kneighbors(test)  # returns ([distances list], [neighbor index list])
    similarity_scores, indices = x[0][0], x[1][0]
    return similarity_scores, indices
class KNN_c():
    def __init__(self, k=5):
        self.k = k
        self._model = KNeighborsClassifier(n_neighbors=k)

    def description(self):
        return 'KNN %s' % (self.k)

    def predict_p(self, X_train, y_train, X_test):
        self._model.fit(X_train, y_train)
        # Compute empirical probabilities: the fraction of positive labels among
        # each test point's k nearest training neighbors (rows kept 2-D for sklearn)
        return np.array([np.mean(y_train[self._model.kneighbors(X_test[i:i + 1, :])[1]] == 1)
                         for i in range(X_test.shape[0])])
class RoomClassifier:
    """
    Class to convert fingerprints into a room label

    Train the classifier using a set of labeled fingerprints by calling fit().
    New fingerprints can be labeled using predict(). To recognize fingerprints
    outside of the calibrated rooms, call predict_outlier() to check whether
    the fingerprints are outliers.

    This class is a simple wrapper around sklearn's PCA and
    KNeighborsClassifier.
    """

    def __init__(self, outlier_threshold=10.0):
        """
        Instantiate room classifier

        :param outlier_threshold: Threshold in dB for outlier detection
        """
        self.dimred = PCA(n_components=5)
        self.classifier = KNeighborsClassifier(n_neighbors=5)
        self.outlier_threshold = outlier_threshold

    def fit(self, fingerprints, label):
        """
        Train room classifier using labeled fingerprints

        :param fingerprints: list of fingerprints
        :param label: list of labels corresponding to fingerprints
        """
        fp = self.dimred.fit_transform(fingerprints)
        self.classifier.fit(fp, label)

    def predict(self, fingerprints):
        """
        Predict room label for all fingerprints

        :param fingerprints: list of fingerprints
        :return: list of room labels
        """
        fp = self.dimred.transform(fingerprints)
        return self.classifier.predict(fp)

    def predict_outlier(self, fingerprints):
        """
        Predict whether the fingerprints are taken in an unlabeled room

        :param fingerprints: list of fingerprints
        :return: list of booleans, True if the room is unlabeled, False otherwise
        """
        fp = self.dimred.transform(fingerprints)
        dist, ind = self.classifier.kneighbors(fp, n_neighbors=1, return_distance=True)
        return (dist > self.outlier_threshold).reshape(-1)
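# Hedged usage sketch for RoomClassifier: fingerprints are illustrative
# RSSI-style vectors in dB (the PCA above needs at least 5 samples and
# 5 features to fit).
import numpy as np

rng = np.random.default_rng(1)
kitchen = rng.normal(-40, 2, (10, 8))
hallway = rng.normal(-70, 2, (10, 8))
fps = np.vstack([kitchen, hallway])
rooms = ['kitchen'] * 10 + ['hallway'] * 10
rc = RoomClassifier(outlier_threshold=10.0)
rc.fit(fps, rooms)
print(rc.predict(rng.normal(-40, 2, (2, 8))))           # likely ['kitchen' 'kitchen']
print(rc.predict_outlier(rng.normal(-100, 2, (1, 8))))  # far from training data -> likely [ True]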
def smote(X, y, target, family, k=None, sp=np.array([])):
    """
    INPUT:
    X, y - your data
    target - the percentage of positive class observations in the output
    k - k in k nearest neighbors

    OUTPUT:
    X_oversampled, y_oversampled - oversampled data

    `smote` generates new observations from the positive (minority) class.
    For details, see: https://www.jair.org/media/953/live-953-2037-jair.pdf
    """
    if target <= np.sum([y == family]) / float(len(y)):
        return X, y
    if k is None:
        k = int(round(len(X) ** .5))  # kneighbors needs an integer k

    # fit kNN model
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X[y == family], y[y == family])
    # kneighbors() returns (distances, indices); we need the indices
    neighbors = knn.kneighbors()[1]
    positive_observations = X[y == family]

    # determine how many new positive observations to generate
    positive_count = np.sum([y == family])
    negative_count = len(y) - positive_count
    target_positive_count = target * negative_count / (1. - target)
    target_positive_count = int(round(target_positive_count))
    number_of_new_observations = target_positive_count - positive_count

    # generate synthetic observations
    synthetic_observations = np.empty((0, X.shape[1]))
    while len(synthetic_observations) < number_of_new_observations:
        obs_index = np.random.randint(len(positive_observations))
        observation = positive_observations[obs_index]
        neighbor_index = np.random.choice(neighbors[obs_index])
        # the returned indices point into the positive subset, not into X
        neighbor = positive_observations[neighbor_index]
        obs_weights = np.random.random(len(neighbor))
        neighbor_weights = 1 - obs_weights
        new_observation = obs_weights * observation + neighbor_weights * neighbor
        synthetic_observations = np.vstack((synthetic_observations, new_observation))

    X_smoted = np.vstack((X, synthetic_observations))
    y_smoted = np.concatenate((y, [family] * len(synthetic_observations)))
    return X_smoted, y_smoted
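# Hedged usage sketch for smote above: lift class 1 from 10% of the data to
# roughly 30%. X, y and the target ratio are illustrative.
import numpy as np

rng = np.random.default_rng(2)
X_toy = rng.normal(size=(100, 3))
y_toy = np.array([1] * 10 + [0] * 90)
X_sm, y_sm = smote(X_toy, y_toy, target=0.3, family=1, k=5)
print(X_sm.shape, round(np.mean(y_sm == 1), 3))  # (129, 3) 0.302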
def main(noun_to_vect_dict_loc, labels_loc, centroids_loc):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Load in pickled noun to vector dictionary
    logger.info('Loading pickled noun to vector dictionary')
    with open(noun_to_vect_dict_loc, 'rb') as f:
        noun_to_vect_dict = pickle.load(f)
    # Create nouns array (list() so dict views also work on Python 3)
    nouns = np.array(list(noun_to_vect_dict.keys()))
    # Create vectors array
    vectors = list(noun_to_vect_dict.values())
    # Create labels array
    labels = []
    # Load in labels
    logger.info('Loading in labels')
    with open(labels_loc, 'r') as f:
        for line in f:
            labels.append(int(line))
    labels = np.array(labels)
    # Load in pickled centroids
    logger.info('Loading pickled centroids')
    with open(centroids_loc, 'rb') as f:
        centroids = pickle.load(f)
    # Create empty dictionary for top nouns for a cluster
    top_nouns_dict = {}
    # Instantiate and fit kNN model
    knc = KNeighborsClassifier(n_jobs=-1)
    knc.fit(vectors, labels)
    # Get indices of top vectors
    for i, centroid in enumerate(centroids):
        # Determine number of representative vectors to get
        class_size = sum(labels == i)
        n_neighbors = 50 if class_size >= 50 else class_size
        # Get indices of n_neighbors vectors nearest to centroid
        # (2-D query; drop the distances that kneighbors would also return)
        indices = knc.kneighbors(X=np.asarray(centroid).reshape(1, -1),
                                 n_neighbors=n_neighbors,
                                 return_distance=False)
        # Add top nouns corresponding to those indices to dictionary
        top_nouns_dict[i] = nouns[indices[0]]
def kNearestNeighbours(X, y, C, L=40, s=""):
    print("Doing k-nearest neighbours for:")
    print(s)
    minError = 500
    bestK = -1
    N = len(X)
    # Cross-validation not necessary. Instead, compute matrix of nearest neighbor
    # distances between each pair of data points ..
    knclassifier = KNeighborsClassifier(n_neighbors=L + 1).fit(X, y)
    neighbors = knclassifier.kneighbors(X)
    # .. and extract matrix where each row contains class labels of subsequent
    # neighbours (sorted by distance)
    ndist, nid = neighbors[0], neighbors[1]
    nclass = y[nid].flatten().reshape(N, L + 1)
    # Use the above matrix to compute the class labels of majority of neighbors
    # (for each number of neighbors l), and estimate the test errors.
    errors = np.zeros(L)
    nclass_count = np.zeros((N, C))
    for l in range(1, L + 1):
        for c in range(C):
            nclass_count[:, c] = sum(nclass[:, 1:l + 1] == c, 1).A.ravel()
        y_est = np.argmax(nclass_count, 1)
        errors[l - 1] = (y_est != y.A.ravel()).sum()
        if errors[l - 1] < minError:
            minError = errors[l - 1]
            bestK = l
    # Plot the classification error rate
    figure()
    plot(100 * errors / N)
    xlabel('Number of neighbors')
    ylabel('Classification error rate (%)')
    figure()
    imshow(nclass, cmap='binary', interpolation='None')
    xlabel("k'th neighbor")
    ylabel('data point')
    title("Neighbors class matrix")
    show()
    print('\n')
    return (bestK, minError / N)
def similar_users(user, genre_arrays, neighbors):
    '''
    Pass: the active user object and the number of neighbors to calculate
    Returns: An array of similar users, containing the username and the
    distance to that user on the genre-dimensional plot
    '''
    # if user.pk == 18:
    #     pdb.set_trace()
    id_array, x_array = genre_arrays
    copy_id_array = id_array.copy()
    copy_x_array = x_array.copy()
    user_id = user.pk
    id_index = copy_id_array.index(user_id)
    user_array = copy_x_array[id_index]
    del copy_id_array[id_index]
    del copy_x_array[id_index]
    if len(copy_x_array) < neighbors:
        neighbors = len(copy_x_array)
    y_array = [random.random() for x in range(len(copy_x_array))]
    neigh = KNeighborsClassifier(n_neighbors=neighbors)
    neigh.fit(copy_x_array, y_array)
    result = neigh.kneighbors([user_array], neighbors)  # 2-D query for sklearn
    similar_users = [[copy_id_array[result[1][0][x]], result[0][0][x]]
                     for x in range(neighbors)]
    return similar_users
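# Hedged usage sketch for similar_users: genre_arrays is assumed to be a pair
# (list of user pks, list of per-user genre vectors); the user stub and data
# below are illustrative.
from types import SimpleNamespace

ids = [1, 2, 3, 4]
genre_vectors = [[0.9, 0.1], [0.8, 0.2], [0.1, 0.9], [0.2, 0.8]]
print(similar_users(SimpleNamespace(pk=1), (ids, genre_vectors), neighbors=2))
# e.g. [[2, 0.141...], [4, 0.989...]] -- nearest user ids with their distances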
def getSimilarArticles(target, numOfDays, numOfNeighbors):
    articles = app.getTrainingSet(500, 70)
    neigh = KNeighborsClassifier()
    count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
    tfidf_trans = TfidfTransformer()
    trainingTitle = [x.title for x in articles]
    trainingLabels = [1 for x in articles]
    targetTitleCounts = count_vect.fit_transform([target.title])
    targetCounts = count_vect.transform([target.text]) + targetTitleCounts
    trainingCounts = count_vect.transform(trainingTitle)
    print(count_vect.get_feature_names())
    trainingCountsTfidf = tfidf_trans.fit_transform(trainingCounts)
    targetCountsTfidf = tfidf_trans.transform(targetTitleCounts)
    print(targetCounts)
    print('After weighted by tfidf:')
    targetCounts = targetCounts.multiply(targetCountsTfidf)
    print(targetCounts)
    neigh.fit(trainingCounts, trainingLabels)
    similar_articles_index = neigh.kneighbors(targetCounts, numOfNeighbors, False)
    similar_articles = []
    for index in similar_articles_index[0]:
        similar_articles.append(articles[index].title)
    return similar_articles
knn.fit(X, y)                # fit with data
knn.predict([[3, 5, 4, 2]])  # predict for a new observation (2-D input)

# predict for multiple observations at once
X_new = [[3, 5, 4, 2], [3, 5, 2, 2]]
knn.predict(X_new)

# try a different value of K
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
knn.predict(X_new)              # predictions
knn.predict_proba(X_new)        # predicted probabilities
knn.kneighbors([[3, 5, 4, 2]])  # distances to nearest neighbors (and identities)

# compute the accuracy for K=5 and K=1
# K = 5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
knn.score(X, y)  # the score function returns the accuracy of your prediction:
                 # the number of correct predictions / the number of rows

# K = 1
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
knn.score(X, y)
def relieff(X, y, ind, _class, no_iter, nneighbors):
    df = pd.DataFrame(X)
    df['class'] = y
    knn = KNeighborsClassifier(n_neighbors=nneighbors)
    # group dataframe by class
    grouped = df.groupby(_class)
    # extract dataframe groups
    by_class_dfs = [df for name, df in grouped]
    # Retrieve selected class dataframe
    df0 = by_class_dfs.pop(ind)
    pos_class = df0['class'].iloc[0]
    # convert dataframe for each group into X matrices of values and y vectors
    # of class labels
    Xy_list = [df_to_Xy(df) for df in by_class_dfs]
    # train kNN models
    knn_models = []
    for X, y in Xy_list:
        knn = KNeighborsClassifier()
        knn_models.append(knn.fit(X, y))
    num_attrs = len(df0.sample().columns) - 1
    weights = np.zeros(num_attrs)
    num_classes = len(by_class_dfs)
    i = 0
    while i < no_iter:
        # print 'i: '+str(i)
        inst0 = df0.drop('class', axis=1).sample()
        a = inst0.as_matrix()[0]
        _df0 = df0.drop(inst0.index)
        X0, y0 = df_to_Xy(_df0)
        knn0 = KNeighborsClassifier(n_neighbors=nneighbors)
        knn0 = knn0.fit(X0, y0)
        nn_hit_indices = knn0.kneighbors(a.reshape([1, -1]), return_distance=False)[0]
        nn_misses = [knn.kneighbors(a.reshape([1, -1]), return_distance=False)[0]
                     for knn in knn_models]
        for j in range(num_attrs):
            nn_hit = [abs(a[j] - X0[l][j]) for l in nn_hit_indices]
            nn_hit_val = np.mean(nn_hit)
            nn_miss = np.zeros(num_classes)
            # print '\tj: '+str(j)
            # For each negative class
            for k in range(num_classes):
                # print '\t\tk: '+str(k)
                X1 = Xy_list[k][0]
                # For each neighbor
                nn_miss_k = [abs(a[j] - X1[l][j]) for l in nn_misses[k]]
                nn_miss[k] = np.mean(nn_miss_k)
            nn_miss_val = np.mean(nn_miss)
            weights[j] = weights[j] - nn_hit_val + nn_miss_val
        i += 1
    return weights, pos_class
subdirname = basedir + actions[actionnum] + '/'
subdir = os.listdir(subdirname)
for seqnum in range(len(subdir)):
    allMHIs[:, :, allMHIs_counter] = computeMHI(subdirname + subdir[seqnum])
    allMoments[allMHIs_counter, :] = huMoments(allMHIs[:, :, allMHIs_counter])
    allMHIs_counter += 1

allMoments = allMoments / np.linalg.norm(allMoments)
testMoments = allMoments[k, :]
allLabels = [i for i in range(1, 6) for j in range(4)]
allLabels = np.array(allLabels)

neigh = KNeighborsClassifier(n_neighbors=5)
# neigh.fit([trainMHI[:,:,i].flatten() for i in range(20)], trainLabels)
# print(neigh.predict([testMHI.flatten()]))
neigh.fit(allMoments, allLabels)
dist, ind = neigh.kneighbors(testMoments.reshape(1, -1), n_neighbors=4)  # 2-D query

fig = plt.figure()
ax = fig.add_subplot(3, 2, 1)
ax.set_title("Input")
ax.imshow(allMHIs[:, :, k], cmap=cm.Greys_r)
for i in range(ind.size):
    bx = fig.add_subplot(3, 2, i + 3)
    bx.set_title("Nearest - %d" % i)
    bx.imshow(allMHIs[:, :, ind[0][i]], cmap=cm.Greys_r)
    # cv2.imshow("pic %d" % i, allMHIs[:, :, i])
plt.show()
if 0xFF & cv2.waitKey(0) == 27:
    cv2.destroyAllWindows()
# and take the labels of these 140 samples as the training labels
iris_x_test = iris_x[indices[-10:]]   # the remaining 10 samples form the test set
iris_y_test = iris_y[indices[-10:]]   # and their labels form the test labels

knn = KNeighborsClassifier()          # create a kNN classifier object
knn.fit(iris_x_train, iris_y_train)   # train it: takes the training set and its labels
iris_y_predict = knn.predict(iris_x_test)   # predict: takes the test set
probility = knn.predict_proba(iris_x_test)  # per-class predicted probabilities for each test sample
neighborpoint = knn.kneighbors([iris_x_test[-1]], 5, False)  # indices of the 5 points closest to the last test sample (2-D query)
score = knn.score(iris_x_test, iris_y_test, sample_weight=None)  # accuracy on the test set

print('iris_y_predict = ')
print(iris_y_predict)      # predicted labels
print('iris_y_test = ')
print(iris_y_test)         # true test labels, for comparison
print('Accuracy:', score)  # accuracy
print('neighborpoint of last test sample:', neighborpoint)
from pylab import *
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier

# requires data from exercise 4.1.1
from ex4_1_1 import *

# Maximum number of neighbors
L = 40

# Cross-validation not necessary. Instead, compute matrix of nearest neighbor
# distances between each pair of data points ..
knclassifier = KNeighborsClassifier(n_neighbors=L + 1).fit(X, ravel(y))
neighbors = knclassifier.kneighbors(X)

# .. and extract matrix where each row contains class labels of subsequent
# neighbours (sorted by distance)
ndist, nid = neighbors[0], neighbors[1]
nclass = y[nid].flatten().reshape(N, L + 1)

# Use the above matrix to compute the class labels of majority of neighbors
# (for each number of neighbors l), and estimate the test errors.
errors = np.zeros(L)
nclass_count = np.zeros((N, C))
for l in range(1, L + 1):
    for c in range(C):
        nclass_count[:, c] = sum(nclass[:, 1:l + 1] == c, 1).A.ravel()
    y_est = np.argmax(nclass_count, 1)
    errors[l - 1] = (y_est != y.A.ravel()).sum()
#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

nbrs = KNeighborsClassifier(n_neighbors=25, algorithm='ball_tree').fit(features_train, labels_train)
distances, indices = nbrs.kneighbors(features_train)
predicted = nbrs.predict(features_test)
accuracy = accuracy_score(labels_test, predicted)
print(accuracy)

try:
    prettyPicture(nbrs, features_test, labels_test)
except NameError:
    pass
class MagentoClassifier(object):
    N_NEIGHBORS = 2  # was 10
    KNN_WEIGHTS = 'distance'

    @staticmethod
    def test_on_dataset(buildings, test_images_per_building=1, train_images_per_building=-1,
                        class_count=-1, n_neighbors=N_NEIGHBORS, weights=KNN_WEIGHTS,
                        method='mode', iterations=1, seed=-1):
        """
        :type test_images_per_building: int
        :type train_images_per_building: int
        :type weights: str
        :type n_neighbors: int
        :type buildings: list[Building]
        :rtype: float
        """
        if method not in METHODS:
            raise Exception('Pick valid method from ' + str(METHODS))
        method_idx = METHODS.index(method)
        ult_score = np.zeros(len(METHODS))
        for iter in range(1, iterations + 1):
            print_info("Starting testing iteration " + str(iter) + "/" + str(iterations))
            train_images, test_images = MagentoClassifier._test_train_split_buildings(
                buildings,
                train_images_per_building=train_images_per_building,
                test_images_per_building=test_images_per_building,
                class_count=class_count,
                seed=seed)
            mc = MagentoClassifier(n_neighbors=n_neighbors, weights=weights)
            mc.fit(train_images)
            score = mc.score(test_images, method=method)
            print_result("Iteration " + str(iter) + "/" + str(iterations) + " score is " +
                         str(score[method_idx]) + " (all scores: " + str(list(zip(METHODS, score))) + ")")
            ult_score += score
            seed += 1 if seed != -1 else 0
            print_result("Ultimate score so far is " + str(ult_score[method_idx] / iter) +
                         " (all scores: " + str(list(zip(METHODS, ult_score / iter))) + ")")
        score_iterations = ult_score / iterations
        print_result("Ultimate score is " + str(score_iterations[method_idx]) +
                     " (all scores: " + str(list(zip(METHODS, score_iterations))) + ")")
        return score_iterations

    @staticmethod
    def _test_train_split_buildings(buildings, test_images_per_building=1,
                                    train_images_per_building=-1, class_count=-1, seed=-1):
        """
        :type test_images_per_building: int
        :type train_images_per_building: int
        :type buildings: list[Building]
        """
        # list() so random.shuffle also works on Python 3's lazy map
        mapped = list(map(lambda building: building.get_test_train_images(
            train_count=train_images_per_building,
            test_count=test_images_per_building,
            seed=seed), buildings))
        random.shuffle(mapped)
        all_trains, all_tests = [], []
        all_classes = 0
        for train, test in mapped:
            if class_count != -1 and all_classes == class_count:
                break
            if (len(train) < train_images_per_building) or (len(test) < test_images_per_building):
                pass
            else:
                all_trains.extend(train)
                all_tests.extend(test)
                all_classes += 1
        if all_classes != class_count:
            assert class_count == -1, "Database too small"
            print_warn("There are not enough samples for all classes")
        print_info(
            "Loaded " + str(all_classes) + " classes, with " + str(train_images_per_building) +
            " train and " + str(test_images_per_building) + " test images, with " +
            str(("seed=" + str(seed)) if seed != -1 else "default seed"))
        return all_trains, all_tests

    def __init__(self, n_neighbors=N_NEIGHBORS, weights=KNN_WEIGHTS):
        self._classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        self._buildings = None  # type: None|list[Building]
        self._classes = None  # type: None|list[int]
        self._features_all = None  # type: None|list[Feature]

    def fit(self, images):
        """
        :type images: list[Images]
        """
        print_info("Starting fitting process")
        assert all([image.get_building() is not None for image in images])
        self._buildings = list(set([image.get_building() for image in images]))
        self._buildings.sort(key=Building.get_identifier)
        self._classes = list(map(Building.get_identifier, self._buildings))
        features_all = []
        for image in images:
            print_result("Processing image " + str(image))
            features_all.append(image.get_all_features())
        features_all = [feature for features in features_all for feature in features]
        self._features_all = features_all
        descriptors_all = np.array([feature.get_descriptor() for feature in features_all])
        assert len(descriptors_all.shape) == 2
        classes_all = np.array(
            [feature.get_image().get_building().get_identifier() for feature in features_all])
        assert len(classes_all.shape) == 1
        assert descriptors_all.shape[0] == classes_all.shape[0]
        self._classifier.fit(descriptors_all, classes_all)

    def predict(self, images, method='mode'):
        """
        :type images: list[Image]
        :rtype: list[int]
        """
        print_info("Starting predict process")
        size = float(len(images))
        data = [(image, self, idx / size, method) for idx, image in enumerate(images)]
        if CPU_COUNT == 1:
            return [predict(d) for d in data]
        else:
            pool = Pool(CPU_COUNT)
            results = pool.map(predict, data)
            pool.close()
            pool = None
            gc.collect()
            return results

    def score(self, images, method='mode'):
        """
        :type images: list[Image]
        :rtype: np.ndarray
        """
        print_info("Starting scoring process")
        y_pred = np.array(self.predict(images, method=method))
        y_true = np.array([image.get_building().get_identifier() for image in images])[:, np.newaxis]
        return np.sum((y_true - y_pred) == 0, axis=0) / float(y_pred.shape[0])

    def show_match(self, image_test, descriptors_all):
        """
        :type image_test: Image
        :type matches: np.array
        :type distances: np.array
        """
        distances, matches = self._classifier.kneighbors(descriptors_all,
                                                         return_distance=True,
                                                         n_neighbors=1)
        image_test_rgb = image_test.get_rgb()  # type: Image
        for feature, matchs, distancess in zip(image_test.get_all_features(), matches, distances):
            xy1, w1 = feature.get_global_xy_w()
            for m in matchs:
                other_feature = self._features_all[m]
                image_train_rgb = other_feature.get_image().get_rgb()
                xy2, w2 = other_feature.get_global_xy_w()
                offset = image_test_rgb.shape[1]
                size = offset + image_train_rgb.shape[1]
                xy2 += [offset, 0]
                showoff = np.zeros((Image.DEFAULT_HEIGHT, size, 3), np.uint8)
                showoff[0:image_test_rgb.shape[0], 0:image_test_rgb.shape[1], :] = image_test_rgb
                showoff[0:image_train_rgb.shape[0], 0 + offset:image_train_rgb.shape[1] + offset, :] = image_train_rgb
                cv2.line(showoff, tuple(xy1), tuple(xy2), (0, 0, 255), thickness=1)
                cv2.circle(showoff, tuple(xy1), w1, (0, 0, 255), thickness=1)
                cv2.circle(showoff, tuple(xy2), w2, (0, 0, 255), thickness=1)
                plt.imshow(cv2.cvtColor(showoff, cv2.COLOR_RGB2BGR)), plt.show()
#             else:
#                 score[l] = 1 / dist[i][j]
#                 count[l] = 1
#     for key in range(numCategories):
#         if key in score:
#             p.append(score[key] / count[key])
#         else:
#             p.append(0)
#     p = np.array(p)
#     proba.append(p / np.sum(p))
#     prediction.append(np.argmax(p) + 1)

# find neighbors
with open(outputDir + "/neighbors.txt", "w") as neighborFile:
    for i in range(len(features["test"])):
        # kneighbors expects a 2-D array, hence the extra brackets
        dist, match = neigh.kneighbors([features["test"][i]])
        neighborFile.write(str(labels["test"][i]) + " " + str(match[0][0]) + " " + str(dist[0][0]) + "\n")

# output data
with open(outputFile, "w") as outFile:  # renamed from `file`, which shadows a builtin
    outFile.write("labels ")
    for c in classes:
        outFile.write(str(c) + " ")
    outFile.write("\n")
    for i in range(len(prediction)):
        l = prediction[i]
        outFile.write(str(l) + " ")
        for p in proba[i]:
            outFile.write(str(p) + " ")
        outFile.write("\n")
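# The commented-out block above hints at inverse-distance class scoring. A
# compact, runnable approximation of that idea (hypothetical names; assumes
# a fitted `neigh`, its training labels, and 1-indexed classes as in the
# original's np.argmax(p) + 1):
import numpy as np

def inverse_distance_proba(neigh, train_labels, x, num_categories):
    """Score each class by the averaged inverse distance of its neighbors."""
    dist, idx = neigh.kneighbors([x])          # query must be 2-D
    score = np.zeros(num_categories)
    count = np.zeros(num_categories)
    for d, j in zip(dist[0], idx[0]):
        l = train_labels[j] - 1                # classes 1..num_categories -> 0-based
        score[l] += 1.0 / max(d, 1e-12)        # guard against zero distance
        count[l] += 1
    p = np.where(count > 0, score / np.maximum(count, 1), 0.0)
    return p / p.sum(), np.argmax(p) + 1       # (probabilities, predicted class)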
knc = KNeighborsClassifier(n_neighbors=5)
knc_1 = []
knc_2 = []
knc_3 = []
knc_4 = []
knc_5 = []

print("Train KNN classifier (step 10/13)")
for d in tqdm(range(reduced.shape[0])):
    dest = destinations.loc[d, 'srch_destination_id']
    # Fit the model on every destination except the current one
    # (leave-one-out), so a destination is never its own neighbour
    knc = knc.fit(reduced[destinations.srch_destination_id != dest],
                  destinations.srch_destination_id[destinations.srch_destination_id != dest])
    # .to_numpy() replaces the deprecated .as_matrix()
    nearest_neighbors = knc.kneighbors(np.reshape(reduced.loc[d, :].to_numpy(), [1, -1]),
                                       return_distance=False)
    nearest_neighbors = nearest_neighbors[0]
    # For each destination, list of first, second, etc. nearest neighbours.
    # Note these are row positions within the filtered training frame, not
    # srch_destination_id values.
    knc_1.append(nearest_neighbors[0])
    knc_2.append(nearest_neighbors[1])
    knc_3.append(nearest_neighbors[2])
    knc_4.append(nearest_neighbors[3])
    knc_5.append(nearest_neighbors[4])

# Now we need to match the destinations in the testing set with their
# corresponding neighbours. For that we need to create temporary dataframes
# to help us merge the results we obtained from the model with the test
# dataframe. There might be a more elegant way to do this!
print("Updating test dataframe (step 11/13)")
temp1 = pd.DataFrame()
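# Filling in the merge sketched above: build a lookup frame with one row
# per destination and its neighbour ids, then left-merge it into the test
# frame. A sketch only; the test frame's name (`test`) and its
# `srch_destination_id` column are assumptions, not from the original.
temp1 = pd.DataFrame({
    'srch_destination_id': destinations['srch_destination_id'],
    'knc_1': knc_1, 'knc_2': knc_2, 'knc_3': knc_3, 'knc_4': knc_4, 'knc_5': knc_5,
})
test = test.merge(temp1, on='srch_destination_id', how='left')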
# create classifier
neigh = KNeighborsClassifier(n_neighbors=3, weights='distance')
print(np.vstack(inputs).shape)
print(np.vstack(outputs))
# each output row is an (x, y) screen position, so this treats every
# distinct position as its own (multi-output) class
neigh.fit(np.vstack(inputs), np.vstack(outputs))

# test
while True:
    ret, frame = cap.read()
    if not ret:
        break
    feat = cnn.calculate_features(frame)
    print(neigh.kneighbors(feat))
    location = np.squeeze(neigh.predict(feat))
    print(location)
    x = location[0]
    y = location[1]
    # show display
    resized = cv2.resize(frame, (0, 0), fx=.25, fy=.25)
    height, width = resized.shape[:2]
    img = np.zeros((512, 512, 3), np.uint8)
    cv2.circle(img, (int(x), int(y)), 10, (255, 0, 0), -1)  # OpenCV needs integer coordinates
    img[-height:, -width:] = resized
    cv2.imshow('frame', img)
    if (cv2.waitKey(1) & 0xFF) == ord('q'):
        break
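# Since the target here is a continuous (x, y) screen position, a
# KNeighborsRegressor is arguably a more natural fit than a classifier: it
# interpolates between neighbouring positions instead of snapping to a
# previously seen one. A sketch under the same assumptions as above:
from sklearn.neighbors import KNeighborsRegressor

reg = KNeighborsRegressor(n_neighbors=3, weights='distance')
reg.fit(np.vstack(inputs), np.vstack(outputs))  # outputs: (n_samples, 2) x/y pairs
x, y = reg.predict(feat)[0]                     # distance-weighted average location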
class project:
    def __init__(self):
        self.train_dataframe = pd.read_csv("data/training.csv", header=0)
        self.test_dataframe = pd.read_csv("data/test.csv", header=0)
        self.test_dataframe_refId = self.test_dataframe["RefId"]
        self.preprocess_data()
        # Initialise classifiers
        self.attributes = list(self.train_dataframe.columns.values)[1:]
        self.lsh_neighbours = 2000
        self.initialise_knn()
        self.initialise_pca()
        self.initialise_svm()
        self.initialise_nn()
        self.lsh = lsh.lsh(self.train_dataframe)

    def preprocess_data(self):
        print("Preprocessing Data")
        self.train_dataframe = preprocess.preprocess(self.train_dataframe)
        self.test_dataframe = preprocess.preprocess(self.test_dataframe)
        # Add dummy column to test dataframe to match dimensions
        # Quick hack: should be removed later
        self.test_dataframe["IsBadBuy"] = 0

    def initialise_knn(self):
        print("Initialising KNN")
        # n_neighbors must be an int; the heuristic k = sqrt(n/2) gives a float
        k = int(np.sqrt(self.lsh_neighbours / 2))
        # k = 150  # Testing
        self.knn_clf = KNeighborsClassifier(n_neighbors=k)

    def initialise_pca(self):
        print("Initialising PCA")
        self.pca_clf = PCA(n_components=len(self.attributes) // 2)

    def initialise_svm(self):
        print("Initialising SVM")
        self.svm_clf = svm.SVC(kernel="linear")

    def initialise_nn(self):
        print("Initialising Neural Network")
        num_hidden_nodes = 3
        learning_rate = 0.05
        batch_size = 30
        self.nn_clf = BernoulliRBM(n_components=num_hidden_nodes, learning_rate=learning_rate,
                                   batch_size=batch_size)

    def run(self):
        predictions = []
        refId = []
        for idx, row in self.test_dataframe.iterrows():
            print("Querying LSH")
            # query_vector = self.train_dataframe.iloc[1]  # Testing query vector
            query_vector = row
            lsh_idx = self.lsh.query(query_vector, self.lsh_neighbours)
            # print(lsh_idx)
            print("K Nearest Neighbours")
            kneighbours = self.k_nearest_neighbours(lsh_idx, query_vector)
            # For PCA
            # train_pca, query_pca = self.perform_pca(kneighbours, query_vector)
            # prediction = self.neural_network(train_pca, query_pca)
            try:
                prediction = self.neural_network(self.train_dataframe.loc[kneighbours], query_vector)
            except Exception:  # a bare except would also swallow KeyboardInterrupt
                prediction = 0
            predictions.append(prediction)
            refId.append(self.test_dataframe_refId.loc[idx])
            # print(str(prediction) + " " + str(self.test_dataframe_refId.loc[idx]))
            # Quick hack for testing
            """
            if idx == 3:
                break
            """
        self.output_data(predictions, refId)

    def k_nearest_neighbours(self, lsh_idx, query_vector):
        """
        This function finds num_neighbours k-nearest-neighbours
        - Default k value: sqrt(num_k_neighbours/2)
        - Default distance: Euclidean
        Reference: http://blog.yhathq.com/posts/classification-using-knn-and-python.html
        Returns: np.array([]) of row indices of the dataframe that are closest to the query vector
        TODO: Graph of accuracy as k increases?
              Or modify how the distance between points is calculated
        """
        lsh_dataframe = self.train_dataframe.loc[lsh_idx]
        self.knn_clf.fit(lsh_dataframe[self.attributes], lsh_dataframe["IsBadBuy"])
        # kneighbors expects a 2-D array, hence the extra brackets
        neighbours = self.knn_clf.kneighbors([query_vector[self.attributes]], return_distance=False)
        # kneighbors returns positions within lsh_dataframe; map them back to
        # row labels of the full training frame
        return lsh_dataframe.index.values[neighbours.flatten()]

    def perform_pca(self, kneighbours, query_vector):
        print("Performing PCA")
        dataframe = self.train_dataframe.loc[kneighbours]
        self.pca_clf.fit(dataframe)
        components = self.pca_clf.components_
        train_pca = self.pca_clf.transform(dataframe)
        query_pca = self.pca_clf.transform([query_vector])  # transform expects a 2-D array
        return train_pca.flatten(), query_pca.flatten()

    def neural_network(self, dataframe, query_vector):
        """
        This function trains a neural network on the selected neighbourhood
        and predicts the label of the query vector
        Using: BernoulliRBM -> SVM (because there are 2 classes) pipeline
        Output: prediction for the query vector
        """
        # Drop the predicted variable, which was previously added as a dummy to match indices
        query_vector = query_vector.drop(["IsBadBuy"])
        classifier = Pipeline(steps=[("neural", self.nn_clf), ("svm", self.svm_clf)])
        classifier.fit(dataframe[self.attributes], dataframe["IsBadBuy"])
        prediction = classifier.predict([query_vector])  # predict expects a 2-D array
        # print(prediction)
        return prediction[0]

    def output_data(self, predictions, refID):
        print("Writing to file")
        array = np.vstack((refID, predictions))
        array_transpose = array.T
        df_results = pd.DataFrame({"RefId": array_transpose[:, 0], "Predicted": array_transpose[:, 1]})
        # `cols=` was removed from pandas long ago; use `columns=`
        df_results.to_csv("results.csv", index=False, columns=["RefId", "Predicted"])
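# The two-stage idea above (LSH for a coarse candidate pool, then exact KNN
# inside it) in isolation. Variable names follow the class above;
# `lsh_index` stands in for the lsh module's query object and is an
# assumption, not the original API:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def two_stage_neighbours(train_df, attributes, query_vector, lsh_index, pool_size=2000):
    candidate_idx = lsh_index.query(query_vector, pool_size)  # coarse, fast
    pool = train_df.loc[candidate_idx]
    k = max(1, int(np.sqrt(pool_size / 2)))                   # heuristic from above
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(pool[attributes], pool["IsBadBuy"])               # exact, on the small pool
    return knn.kneighbors([query_vector[attributes]], return_distance=False).flatten()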
def recommend(data, user, user_conferences, bound=0.4, num_neighbors=20):
    """
    :param data: Data object
    :param user: Username
    :param user_conferences: List of conferences the user has visited
    :param bound: Maximum average neighbour distance for a conference to be recommended
    :param num_neighbors: Number of neighbors in KNN
    :return: List of (conference, distance) pairs
    """
    # fill mapping to numbers
    # print('Creates user model')
    # def make_user_model():
    #     u_confs = lil_matrix((1, len(data.members_set)))
    #     for c in data.list_of_members_with_conferences[user]:
    #         for m in data.list_of_conferences_with_members[c]:
    #             u_confs[0, data.member_to_num[m]] += 1
    #     return normalize(np.divide(u_confs, len(data.list_of_members_with_conferences[user])))
    # user_model_vector = make_user_model()

    # fit
    print('Fit')
    kn = KNeighborsClassifier(n_neighbors=num_neighbors, weights='distance', metric='minkowski', p=2)
    matrix = data.to_sparse_matrix()
    confs_name = ['' for i in range(len(data.conferences_set))]
    for conf in data.conferences_set:
        confs_name[data.conf_to_num[conf]] = conf
    kn.fit(matrix, confs_name)

    # predict
    print('Creates sparse matrix')
    user_confs = data.user_confs_to_sparse_matrix(user)
    # res = sorted(filter(lambda conf: conf not in data.list_of_members_with_conferences[user],
    #                     kn.predict(user_confs)))
    print('Predict')
    rec_conf_with_dist = {i: 0 for i in data.conferences_set}
    rec_conf_number = {i: 0 for i in data.conferences_set}
    for c in user_conferences:
        dst1, ind1 = kn.kneighbors(data.conf_to_vector(c), return_distance=True)
        # normalize expects a 2-D array: normalize the whole distance row, then take it
        conf_with_dist = zip(normalize(dst1)[0], ind1[0])
        for dist, conf in conf_with_dist:
            rec_conf_with_dist[data.num_to_conf[conf]] += dist
            rec_conf_number[data.num_to_conf[conf]] += 1
    # average the accumulated distances per candidate conference
    for conf, number in rec_conf_number.items():
        if number > 1:
            rec_conf_with_dist[conf] /= number
    result = sorted(rec_conf_with_dist.items(), key=operator.itemgetter(1))
    result = list(filter(lambda tmp: tmp[1] != 0 and tmp[1] < bound, result))
    return result

    # dst1, ind1 = kn.kneighbors(user_model_vector, n_neighbors=20, return_distance=True)
    # return dst1, ind1
    # confs_with_similarity = []
    # for conf in data.list_of_members_with_conferences[user]:
    #     conf_lst = lil_matrix((1, len(data.members_set)))
    #     for member in data.list_of_conferences_with_members[conf]:
    #         conf_lst[0, data.member_to_num[member]] = 1
    #     dist, ind = kn.kneighbors(conf_lst, n_neighbors=6, return_distance=True)
    #     dist = dist[0]
    #     ind = [data.num_to_conf[i] for i in ind[0]]
    #     tmp_res = zip(dist, ind)
    #     print(dist)
    #     print(ind)
    #     print(conf)
    #     print('---------')
    #
    #     for i in range(len(ind)):
    #         confs_with_similarity[ind[i]] = max(dist[i], confs_with_similarity[data.num_to_conf[ind[i]]])
    # return confs_with_similarity
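# The aggregation at the heart of recommend(), stripped down: every visited
# conference contributes its neighbours' distances, which are then averaged
# per candidate and sorted ascending. A sketch with a hypothetical input
# shape, not part of the original code:
from collections import defaultdict

def aggregate_neighbour_distances(pairs):
    """pairs: iterable of (candidate_conference, distance) tuples."""
    totals, counts = defaultdict(float), defaultdict(int)
    for cand, dist in pairs:
        totals[cand] += dist
        counts[cand] += 1
    return sorted(((c, totals[c] / counts[c]) for c in totals), key=lambda t: t[1])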
    return i

print("Loading model ...")
data = np.loadtxt(os.path.join(args.d, args.features), delimiter=',') / 255
labels = np.loadtxt(os.path.join(args.d, args.labels), np.uint8)
lines = [i.strip().split(" ") for i in open(os.path.join(args.d, args.labelmapping))]
labelmap = dict([(int(n), l) for l, n in lines])
clf = KNeighborsClassifier(n_neighbors=7, algorithm='brute')
clf.fit(data, labels)

img = resize_rgb_image(read_rgb_image(args.i), (32, 32))
results = []  # renamed from `d` to avoid clashing with the comprehension variable below
for i, details in normed_windows(img, [1.0], details=True):
    arr = flatten(i) / 255
    arr = np.reshape(arr, (1, -1))
    t = clf.predict(arr)
    c = t[0]
    print(c, labelmap[c])
    distances, idx = clf.kneighbors(arr)
    # keep only the distances to neighbours that agree with the prediction
    distances = [d for d, p in zip(distances.flatten(), idx.flatten()) if labels[p] == c]
    s = sum(distances) / len(distances)
    results.append([s, c, details])
results = sorted(results)
for s, c, details in results[:3]:
    print(s, labelmap[c], details)
    img = box(img, details[1], details[2], details[0])
write_rgb_image(args.o, img)
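# The scoring rule above in isolation: a window's confidence is the mean
# distance to those of its k neighbours that share the predicted class
# (lower means a tighter, more confident match). A sketch assuming a fitted
# `clf` and its training `labels` as above:
def same_class_mean_distance(clf, labels, arr):
    c = clf.predict(arr)[0]
    dist, idx = clf.kneighbors(arr)
    same = [d for d, p in zip(dist.flatten(), idx.flatten()) if labels[p] == c]
    return sum(same) / len(same), c  # the predicted class always has at least one neighbour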
                                max_depth=1, random_state=0, loss='ls')

for chunk in iter_csv:
    chunk = chunk.drop(['DateTime'], axis=1)
    attributes = chunk.columns.values
    attributes = np.delete(attributes, attributes.tolist().index('TotalDelay'))
    training_chunk = chunk.iloc[:-1]
    predict_chunk = chunk.iloc[-1]
    actual = predict_chunk['TotalDelay']
    predict_chunk = predict_chunk.drop(['TotalDelay'])
    # KNN -- only the kneighbors lookup is used here, to pick a local
    # neighbourhood; note that recent scikit-learn versions may reject a
    # classifier fit on a continuous target (see the NearestNeighbors sketch below)
    knn.fit(training_chunk[attributes], training_chunk['TotalDelay'])
    # kneighbors expects a 2-D array, hence the reshape
    neighbours = knn.kneighbors(predict_chunk[attributes].values.reshape(1, -1),
                                return_distance=False).flatten()
    # Other regressors, fitted on the local neighbourhood only
    new_training_chunk = training_chunk.iloc[neighbours]  # positional indices, so .iloc (not the removed .ix)
    try:
        gbr.fit(new_training_chunk[attributes], new_training_chunk['TotalDelay'])
        prediction = gbr.predict(predict_chunk[attributes].values.reshape(1, -1))[0]
    except Exception:
        prediction = new_training_chunk['TotalDelay'].mean()
    print(prediction, actual)
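# The pattern above uses a classifier only for its kneighbors() lookup; an
# unsupervised NearestNeighbors index expresses that intent more directly
# and sidesteps the continuous-target issue. A sketch with the same variables:
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=100)
nn.fit(training_chunk[attributes])
local_idx = nn.kneighbors(predict_chunk[attributes].values.reshape(1, -1),
                          return_distance=False).flatten()
local_chunk = training_chunk.iloc[local_idx]  # neighbourhood for the local model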
# Setting number of neighbors = 5
k = 5
# Running KNN model
result, neigh = knn(data, test, k)

# Predicted class
print(result)  # -> Iris-virginica

# 5 nearest neighbors
print(neigh)  # -> [141, 139, 120, 145, 144]

# Comparing our model with scikit-learn
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(data.iloc[:, 0:4], data['Name'])

# Predicted class
print(neigh.predict(test))  # -> ['Iris-virginica']

# 3 nearest neighbors
print(neigh.kneighbors(test)[1])  # -> [[141 139 120]]

"""
Reference: https://www.analyticsvidhya.com/blog/2018/03/introduction-k-neighbours-algorithm-clustering/
"""
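# For an apples-to-apples comparison, give scikit-learn the same k as the
# hand-rolled model above; the neighbour indices should then line up with
# the five listed earlier (a quick check under the same data assumptions):
neigh5 = KNeighborsClassifier(n_neighbors=5)
neigh5.fit(data.iloc[:, 0:4], data['Name'])
print(neigh5.kneighbors(test)[1])  # expected: [[141 139 120 145 144]]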
def main():
    # open a csv writer ("wb" mode only works on Python 2)
    c = csv.writer(open(Config.output_path + "candle.csv", "w", newline=""))
    c.writerow(
        [
            "TICKER",
            "STRAT_PROFIT",
            "STRAT_CAGR",
            "BUY_HOLD",
            "POS_DAYS",
            "NEG_DAYS",
            "TRUE_POS",
            "TRUE_NEG",
            "FALSE_POS",
            "FALSE_NEG",
            "NUM_TRADES",
        ]
    )
    tickers = Config.sp500
    equities = pd.DataFrame([])
    for ticker in tickers:
        # ohlcva
        raw_data = load_data(ticker)
        dates = load_dates(ticker)
        # feature extraction
        features = gap_features(raw_data)
        returns = get_o2c_returns(raw_data, 2)
        k = 50
        neigh = KNeighborsClassifier(n_neighbors=k, weights="uniform", algorithm="brute")
        dtc = tree.DecisionTreeClassifier()
        rf = RandomForestClassifier()
        print("Beginning analysis of: " + ticker.upper())
        bhold = np.ones(len(returns) + 1)
        equity = np.ones(len(returns) + 1)
        equity_second = np.ones(len(returns) + 1)
        for x in range(len(returns) // 4, len(returns)):  # integer division for range()
            # if x == len(returns) // 4 or x % 500 == 0:
            #     rf.fit(features[x - 1200:x], returns[x - 1200:x])
            # returns are continuous, so only the kneighbors lookup below is
            # used; recent scikit-learn may reject classifier fits on
            # continuous targets
            neigh.fit(features[:x], returns[:x])
            # predict = rf.predict(features[x])
            feature_dist, feature_ind = neigh.kneighbors([features[x]])  # kneighbors expects a 2-D array
            summation = sum(returns[feature_ind[0]])
            stdev = np.std(returns[feature_ind[0]])
            if summation / k > 0.001:
                equity[x + 1] = 1 + returns[x]
            if summation > stdev:
                equity_second[x + 1] = 1 + returns[x]
            bhold[x + 1] = 1 + returns[x]
        annualRet = annRet(equity.cumprod()[-1], 0.75 * len(returns))
        # only the first five header columns are populated here
        c.writerow([ticker.upper(), equity.cumprod()[-1], annualRet, bhold.cumprod()[-1],
                    equity_second.cumprod()[-1]])
        # combine equity curves
        series = pd.DataFrame({ticker.upper(): equity[1:]}, index=dates[2:])
        equities = equities.join(series, how="outer")
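# The trading rule above, isolated: go long for the next day when the mean
# next-day return of the k most similar historical days clears a threshold.
# A sketch with the same variable meanings (the threshold is the one used above):
def knn_long_signal(neigh, features_today, returns, k=50, threshold=0.001):
    _, ind = neigh.kneighbors([features_today])
    neighbour_returns = returns[ind[0]]
    return neighbour_returns.sum() / k > threshold  # True -> take the trade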
# ## Tuning a KNN model

# instantiate the model (using the value K=5)
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model with data
knn.fit(X, y)

# predict the response for new observations
knn.predict(X_new)

# calculate predicted probabilities of class membership
knn.predict_proba(X_new)

# print distances to nearest neighbors (and their identities)
# (kneighbors expects a 2-D array: one row per query observation)
knn.kneighbors([[3, 5, 4, 2]])

# ## Comparing KNN with other models

# Advantages of KNN:
# - Simple to understand and explain
# - Model training is fast
# - Can be used for classification and regression!

# Disadvantages of KNN:
# - Must store all of the training data
# - Prediction phase can be slow when the training set is large
# - Sensitive to irrelevant features
# - Sensitive to the scale of the data
# - Accuracy is (generally) not competitive with the best supervised learning methods
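# The "Tuning a KNN model" section above stops at predict/kneighbors calls;
# one standard tuning recipe is a cross-validated grid search over
# n_neighbors and the weighting scheme (a sketch, assuming X and y as above):
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': list(range(1, 31)), 'weights': ['uniform', 'distance']}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)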