def Cluster(data, algo, peso=None):
    # We support two possibilities: K-Means and DBSCAN.
    # The first needs the number of clusters, while the second needs
    # two parameters: eps and min_samples (minPts).
    if algo == "K-Means":
        # Use the Elbow Method to find the optimal number of clusters
        print()
        print("Evaluating number of Clusters")
        print()
        # For each candidate number of clusters, compute the sum of squared
        # distances within the clusters, then plot the result (Elbow Method).
        centers = []
        inertia = []
        for i in range(1, 9):
            kk = KMeans(n_clusters=i, init='k-means++', random_state=0).fit(data)
            inertia.append(kk.inertia_)
            centers.append(i)
        print()
        print("Making the plot of the Elbow Method")
        # Plot of the Elbow Method (figsize must be set on the figure, not on savefig)
        plt.figure(figsize=(12, 6))
        plt.plot(centers, inertia, 'bx-')
        plt.xlabel('k')
        plt.ylabel('Sum_of_squared_distances')
        plt.title('Elbow Method For Optimal k')
        plt.savefig("Elbow Method.png", dpi=150)
        # plt.show()
        plt.close()

        # K-Means algorithm
        # We set 5 clusters because we know the number of classes analyzed
        kmeans = KMeans(n_clusters=5, init='k-means++', random_state=0)
        y_result = kmeans.fit_predict(data)
        centroidi = kmeans.cluster_centers_
    elif algo == "DBSCAN":
        # DBSCAN algorithm
        nbrs = NearestNeighbors(n_neighbors=5).fit(data)
        distances, indices = nbrs.kneighbors(data)
        print("The mean distance is about : " + str(np.mean(distances)))
        # np.median(distances)
        # dbscan = DBSCAN(eps=0.0000000005, min_samples=30700, metric="euclidean", n_jobs=1)
        # dbscan = DBSCAN(eps=0.000005, min_samples=700, metric="euclidean", n_jobs=-1)
        dbscan = DBSCAN(eps=0.003, min_samples=1000, metric="euclidean", n_jobs=-1)
        print(Counter(peso))
        print()
        y_result = dbscan.fit_predict(data, sample_weight=peso)
        centroidi = "In DBSCAN there are no centroids"
    return y_result, centroidi
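# A minimal, hypothetical usage sketch for the Cluster helper above (not part of
# the original code). The toy data is made up, and the imports are repeated here
# in case the surrounding module does not already provide them. Note that the
# hard-coded DBSCAN eps/min_samples above will likely mark toy data as noise (-1).
import numpy as np
from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
toy = np.vstack([rng.normal(loc=c, scale=0.05, size=(200, 2)) for c in range(5)])

labels_km, centroids = Cluster(toy, "K-Means")                  # 5 K-Means labels and centers
labels_db, _ = Cluster(toy, "DBSCAN", peso=np.ones(len(toy)))   # DBSCAN with unit sample weights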
def _upsampling(self, X, y, sigmafactor): """ :param X: :param y: :param sigmafactor: :return: """ k = 10 negative = X[y == self.majority_target] positive = X[y == self.minority_target] size0 = len(negative) size1 = len(positive) newDataNum = round(abs(size0 - size1) * self.ratio) # print(newDataNum) nbrs0 = NearestNeighbors(n_neighbors=k).fit(negative) distances0, indices0 = nbrs0.kneighbors(negative) nbrs1 = NearestNeighbors(n_neighbors=k).fit(positive) distances1, indices1 = nbrs1.kneighbors(positive) gsigma0 = np.mean(distances0, 1) * sigmafactor gsigma1 = np.mean(distances1, 1) * sigmafactor # negative pdf0from0, density0from0 = self.get_pdf_of_points( gsigma0, negative, negative) pdf0from1, density0from1 = self.get_pdf_of_points( gsigma1, positive, negative) # positive pdf1from1, density1from1 = self.get_pdf_of_points( gsigma1, positive, positive) # print(pdf1from1) pdf1from0, density1from0 = self.get_pdf_of_points( gsigma0, negative, positive) # Calculate Posterior Probability confidence0 = self.get_confidence(pdf0from0, pdf0from1, size0, size1) confidence1 = self.get_confidence(pdf1from1, pdf1from0, size1, size0) # search for seed in negative data # Compute confidence ratio for negative data upon new data added to negative data pdf0from0_mat = np.tile(pdf0from0.reshape(-1, 1), (1, size0)) pdf0from0_mat = (pdf0from0_mat * size0 + density0from0) / (size0 + 1) pdf0from0_mat = np.r_[pdf0from0_mat, np.diag(pdf0from0_mat).reshape(1, -1)] pdf0from1_mat = np.tile(pdf0from1.reshape(-1, 1), (1, size0)) pdf0from1_mat = np.r_[pdf0from1_mat, np.diag(pdf0from1_mat).reshape(1, -1)] confidence0_new = self.get_confidence(pdf0from0_mat, pdf0from1_mat, size0 + 1, size1) # Compute confidence ratio for positive data upon new data added to negative data pdf1from0_mat = np.tile(pdf1from0.reshape(-1, 1), (1, size0)) pdf1from0_mat = (pdf1from0_mat * size0 + density1from0) / (size0 + 1) pdf1from1_mat = np.tile(pdf1from1.reshape(-1, 1), (1, size0)) confidence1_new = self.get_confidence(pdf1from1_mat, pdf1from0_mat, size1, size0 + 1) confidence_new_0 = np.r_[confidence0_new, confidence1_new] confidence_old_0 = np.concatenate([ np.r_[np.tile(confidence0.reshape(-1, 1), (1, size0)), confidence0.reshape(1, -1)], np.tile(confidence1.reshape(-1, 1), (1, size0)) ], axis=0) # Relative Certainty Change confidence0_ratio = (confidence_new_0 - confidence_old_0) / confidence_old_0 confidence0_ratio = 0.5 * ( np.mean(confidence0_ratio[0:size0 + 1, :], axis=0) + np.mean(confidence0_ratio[size0 + 1:, :], axis=0)) # Search for seed in positive data # Compute confidence ratio for positive data upon new data added to positive data pdf1from1_mat = np.tile(pdf1from1.reshape(-1, 1), (1, size1)) pdf1from1_mat = (pdf1from1_mat * size1 + density1from1) / (size1 + 1) pdf1from1_mat = np.r_[pdf1from1_mat, np.diag(pdf1from1_mat).reshape(1, -1)] pdf1from0_mat = np.tile(pdf1from0.reshape(-1, 1), (1, size1)) pdf1from0_mat = np.r_[pdf1from0_mat, np.diag(pdf1from0_mat).reshape(1, -1)] confidence1_new = self.get_confidence(pdf1from1_mat, pdf1from0_mat, size1 + 1, size0) # Compute confidence ratio for negative data upon new data added to positive data pdf0from1_mat = np.tile(pdf0from1.reshape(-1, 1), (1, size1)) pdf0from1_mat = (pdf0from1_mat * size1 + density0from1) / (size1 + 1) pdf0from0_mat = np.tile(pdf0from0.reshape(-1, 1), (1, size1)) confidence0_new = self.get_confidence(pdf0from0_mat, pdf0from1_mat, size0, size1 + 1) confidence_new_1 = np.r_[confidence0_new, confidence1_new] confidence_old_1 = np.concatenate([ 
np.tile(confidence0.reshape(-1, 1), (1, size1)), np.r_[np.tile(confidence1.reshape(-1, 1), (1, size1)), confidence1.reshape(1, -1)] ], axis=0) # Relative Certainty Change confidence1_ratio = (confidence_new_1 - confidence_old_1) / confidence_old_1 confidence1_ratio = 0.5 * ( np.mean(confidence1_ratio[0:size0, :], axis=0) + np.mean(confidence1_ratio[size0:, :], axis=0)) confidence = np.append(confidence0_ratio, confidence1_ratio) X_resampled, y_resampled = self.getNewDataByInterpolationRandomSimplex3( X, y, gsigma0, gsigma1, confidence, newDataNum) return X_resampled, y_resampled
          format(i[:10], np.round(density[i[0:10]], 4)))

# Plot possible outliers
# figure(2)
# for k in range(1, 21):
#     subplot(4, 5, k)
#     imshow(np.reshape(X[i[k], :], (16, 16)).T, cmap=cm.binary)
#     xticks([]); yticks([])
#     if k == 3: title('Gaussian Kernel Density: Possible outliers')

### K-neighbors density estimator
# Neighbor to use:
K = 5

# Find the k nearest neighbors
knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)

density = 1. / (D.sum(axis=1) / K)

# Sort the scores
i = density.argsort()
density = density[i]

# Plot k-neighbor estimate of outlier score (distances)
figure(2)
bar(range(20), density[:20])
title('KNN density: Outlier score')
show()

print('KNN density. The index of the lowest density object:\n {},\n score:\n {} '
instance_img_path = Config.image_dir + "/fast_mask_roi_3.jpg"
instance_output = get_single_patch_feature(instance_img_path, espcn_model)
instance_img_path_2 = Config.image_dir + "/fast_mask_roi_6.jpg"
instance_output_2 = get_single_patch_feature(instance_img_path_2, espcn_model)

feature_list = []
for i, img in enumerate(img_list):
    img_torch = Variable(img).cuda()
    feature = extract_crow_feature(img_torch, cls_model)
    feature_list.append(feature)
feature_list = np.array(feature_list)

from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(n_neighbors=5, algorithm="ball_tree", metric="minkowski",
                         n_jobs=4, leaf_size=5, p=2)
model.fit(feature_list)

d2, i2 = model.kneighbors(instance_output.reshape(1, -1))
print d2
d3, i3 = model.kneighbors(instance_output_2.reshape(1, -1))
print d3
def generate_fake_test_from_train_labels(train_seen_label, attribute, seenclasses, unseenclasses, num, per_seen=0.10, \ per_unseen=0.40, per_seen_unseen= 0.50): """ Input: train_seen_label-> images with labels containing objects less than opt.N attribute-> array containing word embeddings seenclasses-> array containing seen class indices unseenclasses-> array containing unseen class indices num-> number of generated synthetic labels Output: gzsl -> tensor containing synthetic labels of only unseen, seen and seen-unseen classes. """ if train_seen_label.min() == 0: print("Training data already trimmed and converted") else: print("original training data received (-1,1)'s ") train_seen_label = torch.clamp(train_seen_label, 0, 1) #remove all zero labeled images while training train_seen_label = train_seen_label[(train_seen_label.sum(1) != 0).nonzero().flatten()] seen_attributes = attribute[seenclasses] unseen_attributes = attribute[unseenclasses] seen_percent, unseen_percent, seen_unseen_percent = per_seen, per_unseen, per_seen_unseen print("seen={}, unseen={}, seen-unseen={}".format(seen_percent, unseen_percent, seen_unseen_percent)) print("syn num={}".format(num)) gzsl = [] for i in range(0, num): new_gzsl_syn_list = [] seen_unseen_label_pairs = {} nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(unseen_attributes) for seen_idx, seen_att in zip(seenclasses, seen_attributes): _, indices = nbrs.kneighbors(seen_att[None, :]) seen_unseen_label_pairs[seen_idx.tolist()] = unseenclasses[ indices[0][0]].tolist() #ADDING ONLY SEEN LABELS idx = torch.randperm( len(train_seen_label))[0:int(len(train_seen_label) * seen_percent)] seen_labels = train_seen_label[idx] _new_gzsl_syn_list = torch.zeros(seen_labels.shape[0], attribute.shape[0]) _new_gzsl_syn_list[:, :len(seenclasses)] = seen_labels new_gzsl_syn_list.append(_new_gzsl_syn_list) #ADDING ONLY UNSEEN LABELS idx = torch.randperm(len( train_seen_label))[0:int(len(train_seen_label) * unseen_percent)] temp_label = train_seen_label[idx] _new_gzsl_syn_list = torch.zeros(temp_label.shape[0], attribute.shape[0]) for m, lab in enumerate(temp_label): new_lab = torch.zeros(attribute.shape[0]) unseen_lab = lab.nonzero().flatten() u = [] for i in unseen_lab: u.append(seen_unseen_label_pairs[i.tolist()]) new_lab[u] = 1 _new_gzsl_syn_list[m, :] = new_lab unseen_labels = _new_gzsl_syn_list new_gzsl_syn_list.append(unseen_labels) #ADDING BOTH SEEN AND UNSEEN LABELS 50% OF THE SELECTED SEEN LABELS IS MAPPED TO UNSEEN LABELS idx = torch.randperm( len(train_seen_label ))[0:int(len(train_seen_label) * seen_unseen_percent)] temp_label = train_seen_label[idx] _new_gzsl_syn_list = torch.zeros(temp_label.shape[0], attribute.shape[0]) for m, lab in enumerate(temp_label): u = [] new_lab = torch.zeros(attribute.shape[0]) seen_unseen_lab = lab.nonzero().flatten() temp_seen_label = np.random.choice( seen_unseen_lab, int(len(seen_unseen_lab) * 0.50)) u.extend(temp_seen_label) rem_seen_label = np.setxor1d(temp_seen_label, seen_unseen_lab) for i in rem_seen_label: u.append(seen_unseen_label_pairs[i.tolist()]) new_lab[u] = 1 _new_gzsl_syn_list[m, :] = new_lab seen_unseen_labels = _new_gzsl_syn_list new_gzsl_syn_list.append(seen_unseen_labels) new_gzsl_syn_list = torch.cat(new_gzsl_syn_list) gzsl.append(new_gzsl_syn_list) gzsl = torch.cat(gzsl) tmp_list = gzsl.sum(0) ## To make sure every unseen label gets covered empty_lab = torch.arange(tmp_list.numel())[tmp_list == 0] min_uc = int(tmp_list[len(seenclasses):][ tmp_list[len(seenclasses):] > 0].min().item()) for el in 
empty_lab: idx = torch.randperm(gzsl.size(0))[:min_uc] gzsl[idx, el] = 1 gzsl = gzsl.long() print("GZSL TEST LABELS:", gzsl.shape) return gzsl
def pre_auto_cluster(PhoneValueVector_Chinese, ValueVectorArray_Chi, n_neighbor, plot): value_size, feasure_size = ValueVectorArray_Chi.shape nbrs = NearestNeighbors(n_neighbors=n_neighbor + 1, algorithm='brute', metric='cosine').fit(ValueVectorArray_Chi) knn_matrix = nbrs.kneighbors(ValueVectorArray_Chi, return_distance=False) cosine_dist = (1 - cosine_similarity(ValueVectorArray_Chi)) #local density k_dis_list = [] for q in range(knn_matrix.shape[0]): k_dis = 0 for p in range(1, knn_matrix.shape[1]): dis = cosine_dist[q][knn_matrix[q][p]] k_dis += dis k_dis_mean = (knn_matrix.shape[1] - 1) / (k_dis + 1 ) # sys.float_info.min k_dis_list.append(k_dis_mean) #density base distance min_dist_list = [] dist_matrix = pairwise_distances(ValueVectorArray_Chi, Y=None, metric='cosine') k_dis_sort = sorted(enumerate(k_dis_list), key=lambda x: x[1]) for n in range(len(k_dis_sort)): dist_higher_list = [] for m in range(n + 1, len(k_dis_sort)): dist_higher = dist_matrix[k_dis_sort[n][0]][k_dis_sort[m][0]] dist_higher_list.append({ 'value index': k_dis_sort[n][0], 'shortest index': k_dis_sort[m][0], 'dist': dist_higher }) if len(dist_higher_list) > 0: min_dist = min(dist_higher_list, key=lambda x: x['dist']) min_dist_list.append(min_dist) else: index = dist_matrix[k_dis_sort[n][0]].tolist().index( max(dist_matrix[k_dis_sort[n][0]])) max_dist = ({ 'value index': k_dis_sort[n][0], 'shortest index': index, 'dist': dist_matrix[k_dis_sort[n][0]][index] }) min_dist_list.append(max_dist) ld_dbd = pd.DataFrame(min_dist_list) ld_dbd['local density'] = sorted(k_dis_list) ld_dbd = ld_dbd.sort_values(by='value index').reset_index(drop=True) value_name_list = [] for j in ld_dbd['value index'].tolist(): value_name_list.append(PhoneValueVector_Chinese.loc[j]['value']) ld_dbd['value name'] = value_name_list combine = np.array(k_dis_list) * np.array(ld_dbd['dist'].tolist()) combine_list = ({ 'value index': list(range(combine.shape[0])), 'combine': combine, 'value name': PhoneValueVector_Chinese.loc[list(range(combine.shape[0]))]['value'] }) combine_df = pd.DataFrame(combine_list) combine_df.index = range(len(combine_df)) combine_ = sorted(list(combine)) combine_df = combine_df.sort_values(by='combine', ascending=False).reset_index(drop=True) if plot != 0: #plot to identify the cluster center and size plt.figure(figsize=(14, 10)) plt.scatter(list(range(len(combine_))), combine_) plt.show() diff_list = [] combine_sort = combine_df['combine'].tolist() for q in range(len(combine_df) - 1): diff = abs(combine_sort[q + 1] - combine_sort[q]) diff_list.append(diff) plt.figure(figsize=(14, 8)) plt.bar(list(range(plot)), diff_list[:plot]) plt.show() return ld_dbd, combine_df
def __init__(self, ratio='auto', random_state=None, verbose=True, k=5, m=10, out_step=0.5, kind='regular', n_jobs=-1, **kwargs): """Initialisation of SMOTE object. Parameters ---------- ratio : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balanced the dataset. Otherwise, the ratio will corresponds to the number of samples in the minority class over the the number of samples in the majority class. random_state : int or None, optional (default=None) Seed for random number generation. verbose : bool, optional (default=True) Boolean to either or not print information about the processing. k : int, optional (default=5) Number of nearest neighbours to used to construct synthetic samples. m : int, optional (default=10) Number of nearest neighbours to use to determine if a minority sample is in danger. out_step : float, optional (default=0.5) Step size when extrapolating. kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' n_jobs : int, optional (default=-1) Number of threads to run the algorithm when it is possible. """ super(SMOTE, self).__init__(ratio=ratio, random_state=random_state, verbose=verbose) # Check the number of thread to use self.n_jobs = n_jobs # --- The type of smote # This object can perform regular smote over-sampling, borderline 1, # borderline 2 and svm smote. Since the algorithms are fairly simple # they share most methods. possible_kind = ('regular', 'borderline1', 'borderline2', 'svm') if kind in possible_kind: self.kind = kind else: raise ValueError('Unknown kind for SMOTE algorithm.') # --- Verbose # Control whether or not status and progress information should be self.verbose = verbose # --- Nearest Neighbours for synthetic samples # The smote algorithm uses the k-th nearest neighbours of a minority # sample to generate new synthetic samples. self.k = k # --- NN object # Import the NN object from scikit-learn library. Since in the smote # variations we must first find samples that are in danger, we # initialize the NN object differently depending on the method chosen if kind == 'regular': # Regular smote does not look for samples in danger, instead it # creates synthetic samples directly from the k-th nearest # neighbours with not filtering self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1, n_jobs=self.n_jobs) else: # Borderline1, 2 and SVM variations of smote must first look for # samples that could be considered noise and samples that live # near the boundary between the classes. Therefore, before # creating synthetic samples from the k-th nns, it first look # for m nearest neighbors to decide whether or not a sample is # noise or near the boundary. self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1, n_jobs=self.n_jobs) # --- Nearest Neighbours for noise and boundary (in danger) # Before creating synthetic samples we must first decide if # a given entry is noise or in danger. We use m nns in this step self.m = m # --- SVM smote # Unlike the borderline variations, the SVM variation uses the support # vectors to decide which samples are in danger (near the boundary). # Additionally it also introduces extrapolation for samples that are # considered safe (far from boundary) and interpolation for samples # in danger (near the boundary). The level of extrapolation is # controled by the out_step. 
if kind == 'svm': # Store extrapolation size self.out_step = out_step # Store SVM object with any parameters self.svm_ = SVC(random_state=self.rs_, **kwargs)
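# A small, hedged illustration of why the SMOTE initialiser above asks for
# k + 1 (or m + 1) neighbours (not from the original code): when kneighbors()
# is queried with the same points the model was fitted on, each sample is
# returned as its own first neighbour, so one extra neighbour is requested and
# the first column is discarded.
import numpy as np
from sklearn.neighbors import NearestNeighbors

pts = np.array([[0.0], [1.0], [2.0], [10.0]])
nn = NearestNeighbors(n_neighbors=3).fit(pts)
_, idx = nn.kneighbors(pts)
print(idx[:, 0])   # [0 1 2 3] -> each point is its own nearest neighbour
print(idx[:, 1:])  # the k = 2 "real" neighbours used for synthesis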
    'totalRatingCount >= @popularity_threshold')
rating_popular_book.head()

moviemat = Movies_with_Rating.pivot_table(index='title', columns='user_id',
                                          values='rating').fillna(0)
moviemat.head()

Movie_ratingCount.sort_values('totalRatingCount', ascending=False).head(10)

# Filter
user_rating_matrix = csr_matrix(moviemat.values)

from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(user_rating_matrix)

query_index = np.random.choice(moviemat.shape[0])
us = np.array(moviemat.iloc[query_index, :]).reshape(1, -1)
distances, indices = model_knn.kneighbors(us, n_neighbors=6)

x_text = []
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(moviemat.index[query_index]))
    else:
        x_text.append(moviemat.index[indices.flatten()[i]])
        print('{0}: {1}, with distance of {2}:'.format(
            i, moviemat.index[indices.flatten()[i]],
def knn_predictor(x_train, y_train, x_test, y_test):
    # NearestNeighbors has no predict()/score(); a supervised k-NN classifier
    # is needed here, so KNeighborsClassifier is used instead.
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    f1 = precision_recall_fscore_support(y_test, clf.predict(x_test),
                                         average='weighted')[2]
    print(accuracy, f1)
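# A minimal, hypothetical usage sketch for knn_predictor above (not from the
# original code): it simply runs the function on the Iris dataset, assuming
# scikit-learn is available.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support

X, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
knn_predictor(x_train, y_train, x_test, y_test)  # prints accuracy and weighted F1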
def knn_predictor(audio_feats, k=100): """ differences_df = knn_predictor(audio_features) """ # Scale the data with standard scaler scaler = StandardScaler() spotify_scaled = scaler.fit_transform(spotify) ################################################ audio_feats_scaled = scaler.transform([audio_feats]) ## Nearest Neighbors model knn = NearestNeighbors(n_neighbors=k, algorithm='kd_tree') knn.fit(spotify_scaled) # JOBLIB dump dump(knn, 'knn_final.joblib', compress=True) # make prediction prediction = knn.kneighbors(audio_feats_scaled) # create an index for similar songs similar_songs_index = prediction[1][0][:25].tolist() # Create an empty list to store simlar song names similar_song_ids = [] similar_song_names = [] # loop over the indexes and append song names to empty list above for i in similar_songs_index: song_id = identify['track_id'].iloc[i] similar_song_ids.append(song_id) song_name = identify['track_name'].iloc[i] similar_song_names.append(song_name) ################################################# column_names = spotify.columns.tolist() # put scaled audio features into a dataframe audio_feats_scaled_df = pd.DataFrame(audio_feats_scaled, columns=column_names) # create empty list of similar songs' features similar_songs_features = [] # loop through the indexes of similar songs to get audio features for each #. similar song for index in similar_songs_index: list_of_feats = spotify.iloc[index].tolist() similar_songs_features.append(list_of_feats) # scale the features and turn them into a dataframe similar_feats_scaled = scaler.transform(similar_songs_features) similar_feats_scaled_df = pd.DataFrame(similar_feats_scaled, columns=column_names) # get the % difference between the outputs and input songs col_names = similar_feats_scaled_df.columns.to_list() diff_df = pd.DataFrame(columns=col_names) for i in range(25): diff = abs(similar_feats_scaled_df.iloc[i] - audio_feats_scaled_df.iloc[0]) # print('type: ', type(similar_feats_scaled_df.iloc[i])) diff_df.loc[i] = diff # add sums of differences diff_df['sum'] = diff_df.sum(axis=1) diff_df = diff_df.sort_values(by=['sum']) diff_df = diff_df.reset_index(drop=True) # add track_id to DF diff_df['track_id'] = similar_song_ids # reorder cols to have track_id as first column cols = list(diff_df) cols.insert(0, cols.pop(cols.index('track_id'))) diff_df = diff_df.loc[:, cols] # Remove the suggestion of the same song (all 0's) diff_df = diff_df[~(diff_df == 0).any(axis=1)] # Grab only the unique 10 songs diff_df = diff_df.drop_duplicates(subset=['sum'])[:10] diff_df = diff_df.reset_index(drop=True) return diff_df
# allsubspaces = range(max_count - 1 - 7, 0, -1)
frmt = str(col) + 'b'
factor = 1
output = eig_vecs[:, :].dot(X)
output = output.T
count = 0
for i in allsubspaces:
    bin_value = str(format(i, frmt))
    bin_value = bin_value[::-1]
    subspace_col = [
        col - 2 - index for index, value in enumerate(bin_value) if value == '1'
    ]
    print("%d : %s : '%s'" % (i, subspace_col, bin_value[::-1]))
    np_subspace = output[:, subspace_col]
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(np_subspace)
    temp = nbrs.kneighbors_graph(np_subspace).toarray()
    temp = temp.astype(np.uint64)
    heidi_matrix = heidi_matrix * 2 + temp
    temp_array.append([heidi_matrix])
    factor = factor * 2
    count += 1

max_count = max_count - 1
r = int(max_count / 3)
g = int(max_count / 3)
b = max_count - r - g
x = heidi_matrix >> (max_count - r)
y = (heidi_matrix & ((pow(2, g) - 1) << b)) >> b
z = (heidi_matrix & (pow(2, b) - 1))
def compute_velocity_on_grid(X_emb, V_emb, density=None, smooth=None,
                             n_neighbors=None, min_mass=None, autoscale=True,
                             adjust_for_stream=False, cutoff_perc=None):
    # remove invalid cells
    idx_valid = np.isfinite(X_emb.sum(1) + V_emb.sum(1))
    X_emb = X_emb[idx_valid]
    V_emb = V_emb[idx_valid]

    # prepare grid
    n_obs, n_dim = X_emb.shape
    density = 1 if density is None else density
    smooth = .5 if smooth is None else smooth

    grs = []
    for dim_i in range(n_dim):
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        m = m - .01 * np.abs(M - m)
        M = M + .01 * np.abs(M - m)
        gr = np.linspace(m, M, int(50 * density))
        grs.append(gr)

    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None:
        n_neighbors = int(n_obs / 50)
    nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1)
    nn.fit(X_emb)
    dists, neighs = nn.kneighbors(X_grid)

    scale = np.mean([(g[1] - g[0]) for g in grs]) * smooth
    weight = normal.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    V_grid = (V_emb[neighs] * weight[:, :, None]).sum(1) / np.maximum(1, p_mass)[:, None]
    if min_mass is None:
        min_mass = 1

    if adjust_for_stream:
        X_grid = np.stack([np.unique(X_grid[:, 0]), np.unique(X_grid[:, 1])])
        ns = int(np.sqrt(len(V_grid[:, 0])))
        V_grid = V_grid.T.reshape(2, ns, ns)

        mass = np.sqrt((V_grid**2).sum(0))
        min_mass = 10**(min_mass - 6)  # default min_mass = 1e-5
        min_mass = np.clip(min_mass, None, np.max(mass) * .9)
        cutoff = mass.reshape(V_grid[0].shape) < min_mass

        if cutoff_perc is None:
            cutoff_perc = 5
        length = np.sum(np.mean(np.abs(V_emb[neighs]), axis=1), axis=1).T.reshape(ns, ns)
        cutoff |= length < np.percentile(length, cutoff_perc)

        V_grid[0][cutoff] = np.nan
    else:
        min_mass *= np.percentile(p_mass, 99) / 100
        X_grid, V_grid = X_grid[p_mass > min_mass], V_grid[p_mass > min_mass]

        if autoscale:
            V_grid /= 3 * quiver_autoscale(X_grid, V_grid)

    return X_grid, V_grid
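# A small, hypothetical smoke test for compute_velocity_on_grid (not from the
# original code). The random embedding and velocities stand in for real data,
# and autoscale=False avoids the external quiver_autoscale helper, which is not
# defined in this excerpt.
import numpy as np
from scipy.stats import norm as normal
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X_emb = rng.normal(size=(500, 2))        # 2-D embedding coordinates
V_emb = rng.normal(size=(500, 2)) * 0.1  # matching velocity vectors

X_grid, V_grid = compute_velocity_on_grid(X_emb, V_emb, autoscale=False)
print(X_grid.shape, V_grid.shape)        # grid points kept after the mass cutoff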
ur_test = pandas.read_csv("predictions.dat", '\t')
ur_train = ur

max_movie_id = len(np.unique(ur["movieID"]))
max_user_id = len(np.unique(ur["userID"]))
movieMap = np.unique(ur["movieID"])
userMap = np.unique(ur["userID"])

ratingsMatrix = np.ones((max_user_id, max_movie_id)) * 3.7123
for index, row in ur_train.iterrows():
    movieIdx = np.where(movieMap == row["movieID"])
    userIdx = np.where(userMap == row["userID"])
    ratingsMatrix[userIdx, movieIdx] = row["rating"]

predictedRatings = np.zeros((2, testInstances))
model = NearestNeighbors()

idx = 0
for index, row in ur_test.iterrows():
    getRating(row, ratingsMatrix, movieMap, userMap, idx, predictedRatings)
    idx += 1
    print(idx, idx / float(testInstances))

df = pandas.DataFrame(data=predictedRatings.T, columns=["testID", "predicted rating"])
df[['testID']] = df[['testID']].astype(int)
df.to_csv("mypredictions.dat", index=False)
filename = "Patients.csv" rows = [] with open(filename, 'r') as csvfile: csvreader = csv.reader(csvfile) for row in csvreader: rows.append(list(map(int, row))) data = np.array(rows) data_nor = StandardScaler().fit_transform(data) dimens = data.shape[1] min_poin = 2 * dimens print("Min_points = ", min_poin) neigh = NearestNeighbors(n_neighbors=min_poin - 1) nbrs = neigh.fit(data_nor) distance, indices = nbrs.kneighbors(data_nor) distance = np.sort(distance, axis=0) distances = distance[:, 1] plt.plot(distances) plt.show() y = distances x = range(1, len(y) + 1) kn = KneeLocator(x, y, curve='convex', direction='increasing') elb = distances[kn.knee - 1] plt.xlabel('number of clusters k') plt.ylabel('Distances(eligible for eplison)') plt.plot(x, y)
features_array[:data.shape[0], 4] = np.array(features_dict['energy']).T
features_array[:data.shape[0], 5] = np.array(features_dict['instrumentalness']).T
features_array[:data.shape[0], 6] = np.array(features_dict['liveness']).T
features_array[:data.shape[0], 7] = np.array(features_dict['loudness']).T
features_array[:data.shape[0], 8] = np.array(features_dict['mode']).T
features_array[:data.shape[0], 9] = np.array(features_dict['speechiness']).T
features_array[:data.shape[0], 10] = np.array(features_dict['tempo']).T
features_array[:data.shape[0], 11] = np.array(features_dict['year']).T
features_array[data.shape[0], :] = input_
features_array = features_array.astype('float64')

model = NearestNeighbors(n_neighbors=5000, algorithm='ball_tree')

scalar = StandardScaler()
scalar.fit(features_array)
features_array = scalar.transform(features_array)

input_2 = features_array[data.shape[0], :]
model.fit(features_array[:data.shape[0], :])
distances, indices = model.kneighbors([input_2])
recorded_indices = indices

closest_1000_point = {
    'valence': [],
    'year': [],
    'popularity': [],
    'mode': [],
    'acousticness': [],
def fit(self, X, y, training_indices=None): """Specify data to be plotted, and fit classifier only if required (the specified clasifier is only trained if it has not been trained yet). All the input data is provided in the matrix X, and corresponding binary labels (values taking 0 or 1) in the vector y Parameters ---------- X : array-like, shape = [n_samples, n_features] A {n_samples by n_samples} size matrix containing data y : array-like, shape = [n_samples] Labels training_indices : array-like or float, optional (default=None) Indices on which the classifier has been trained / should be trained. If float, it is converted to a random sample with the specified proportion of the full dataset. Returns ------- self : returns an instance of self. """ if set(np.array(y, dtype=int).tolist()) != set([0, 1]): raise Exception( "Currently only implemented for binary classification. Make sure you pass in two classes (0 and 1)" ) if training_indices == None: train_idx = range(len(y)) elif type(training_indices) == float: train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2) else: train_idx = training_indices self.X = X self.y = y self.train_idx = train_idx #self.test_idx = np.setdiff1d(np.arange(len(y)), self.train_idx, assume_unique=False) self.test_idx = list( set(range(len(y))).difference(set(self.train_idx))) # fit classifier if necessary try: self.classifier.predict([X[0]]) except: self.classifier.fit(X[train_idx, :], y[train_idx]) self.y_pred = self.classifier.predict(self.X) # fit DR method if necessary try: self.dimensionality_reduction.transform([X[0]]) except: self.dimensionality_reduction.fit(X, y) try: self.dimensionality_reduction.transform([X[0]]) except: raise Exception( "Please make sure your dimensionality reduction method has an exposed transform() method! If in doubt, use PCA or Isomap" ) # transform data self.X2d = self.dimensionality_reduction.transform(self.X) self.mean_2d_dist = np.mean(pdist(self.X2d)) self.X2d_xmin, self.X2d_xmax = np.min(self.X2d[:, 0]), np.max(self.X2d[:, 0]) self.X2d_ymin, self.X2d_ymax = np.min(self.X2d[:, 1]), np.max(self.X2d[:, 1]) self.majorityclass = 0 if list(y).count(0) > list(y).count(1) else 1 self.minorityclass = 1 - self.majorityclass minority_idx, majority_idx = np.where( y == self.minorityclass)[0], np.where(y == self.majorityclass)[0] self.Xminor, self.Xmajor = X[minority_idx], X[majority_idx] self.Xminor2d, self.Xmajor2d = self.X2d[minority_idx], self.X2d[ majority_idx] # set up efficient nearest neighbor models for later use self.nn_model_2d_majorityclass = NearestNeighbors(n_neighbors=2) self.nn_model_2d_majorityclass.fit(self.X2d[majority_idx, :]) self.nn_model_2d_minorityclass = NearestNeighbors(n_neighbors=2) self.nn_model_2d_minorityclass.fit(self.X2d[minority_idx, :]) # step 1. look for decision boundary points between corners of majority & # minority class distribution minority_corner_idx, majority_corner_idx = [], [] for extremum1 in [np.min, np.max]: for extremum2 in [np.min, np.max]: _, idx = self.nn_model_2d_minorityclass.kneighbors([[ extremum1(self.Xminor2d[:, 0]), extremum2(self.Xminor2d[:, 1]) ]]) minority_corner_idx.append(idx[0][0]) _, idx = self.nn_model_2d_majorityclass.kneighbors([[ extremum1(self.Xmajor2d[:, 0]), extremum2(self.Xmajor2d[:, 1]) ]]) majority_corner_idx.append(idx[0][0]) # optimize to find new db keypoints between corners self._linear_decision_boundary_optimization(minority_corner_idx, majority_corner_idx, all_combinations=True, step=1) # step 2. 
look for decision boundary points on lines connecting randomly # sampled points of majority & minority class n_samples = int(self.n_connecting_keypoints) from_idx = list( random.sample(list(np.arange(len(self.Xminor))), n_samples)) to_idx = list( random.sample(list(np.arange(len(self.Xmajor))), n_samples)) # optimize to find new db keypoints between minority and majority class self._linear_decision_boundary_optimization(from_idx, to_idx, all_combinations=False, step=2) if len(self.decision_boundary_points_2d) < 2: print( "Failed to find initial decision boundary. Retrying... If this keeps happening, increasing the acceptance threshold might help. Also, make sure the classifier is able to find a point with 0.5 prediction probability (usually requires an even number of estimators/neighbors/etc)." ) return self.fit(X, y, training_indices) # step 3. look for decision boundary points between already known db # points that are too distant (search on connecting line first, then on # surrounding hypersphere surfaces) edges, gap_distances, gap_probability_scores = self._get_sorted_db_keypoint_distances( ) # find gaps self.nn_model_decision_boundary_points = NearestNeighbors( n_neighbors=2) self.nn_model_decision_boundary_points.fit( self.decision_boundary_points) i = 0 retries = 0 while i < self.n_interpolated_keypoints: if self.verbose: print("Step 3/{}:{}/".format(self.steps, i, self.n_interpolated_keypoints)) if self.random_gap_selection: # randomly sample from sorted DB keypoint gaps? gap_idx = np.random.choice(len(gap_probability_scores), 1, p=gap_probability_scores)[0] else: # get largest gap gap_idx = 0 from_point = self.decision_boundary_points[edges[gap_idx][0]] to_point = self.decision_boundary_points[edges[gap_idx][1]] # optimize to find new db keypoint along line connecting two db keypoints # with large gap db_point = self._find_decision_boundary_along_line( from_point, to_point, penalize_tangent_distance=self.penalties_enabled) if self.decision_boundary_distance( db_point) > self.acceptance_threshold: if self.verbose: print( "No good solution along straight line - trying to find decision boundary on hypersphere surface around known decision boundary point" ) # hypersphere radius half the distance between from and to db keypoints R = euclidean(from_point, to_point) / 2.0 # search around either source or target keypoint, with 0.5 probability, # hoping to find decision boundary in between if random.random() > 0.5: from_point = to_point # optimize to find new db keypoint on hypersphere surphase around known keypoint db_point = self._find_decision_boundary_on_hypersphere( from_point, R) if self.decision_boundary_distance( db_point) <= self.acceptance_threshold: db_point2d = self.dimensionality_reduction.transform( [db_point])[0] self.decision_boundary_points.append(db_point) self.decision_boundary_points_2d.append(db_point2d) i += 1 retries = 0 else: retries += 1 if retries > self.hypersphere_max_retry_budget: i += 1 dist = self.decision_boundary_distance(db_point) msg = "Found point is too distant from decision boundary ({}), but retry budget exceeded ({})" print( msg.format(dist, self.hypersphere_max_retry_budget)) elif self.verbose: dist = self.decision_boundary_distance(db_point) print( "Found point is too distant from decision boundary ({}) retrying..." 
.format(dist)) else: db_point2d = self.dimensionality_reduction.transform( [db_point])[0] self.decision_boundary_points.append(db_point) self.decision_boundary_points_2d.append(db_point2d) i += 1 retries = 0 edges, gap_distances, gap_probability_scores = self._get_sorted_db_keypoint_distances( ) # reload gaps self.decision_boundary_points = np.array(self.decision_boundary_points) self.decision_boundary_points_2d = np.array( self.decision_boundary_points_2d) if self.verbose: print("Done fitting! Found {} decision boundary keypoints.".format( len(self.decision_boundary_points))) return self
# Hyperparameters are described here.
parser.add_argument("--n_neighbors", type=int, default=10)
parser.add_argument("--metric", type=str, default="cosine")

# SageMaker specific arguments. Defaults are set in the environment variables.
parser.add_argument("--output-data-dir", type=str)
parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])

args = parser.parse_args()

# Load the training data into a Pandas dataframe and make sure it is in the appropriate format
embeddings = pd.read_csv(
    os.path.join(args.train, "embeddings.csv.tar.gz"),
    compression="gzip",
    index_col=False,
    header=None,
)

# Supply the hyperparameters of the nearest neighbors model
n_neighbors = args.n_neighbors
metric = args.metric

# Now, fit the nearest neighbors model
nn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
model_nn = nn.fit(embeddings)
print("model has been fitted")

# Save the model to the output location in S3
joblib.dump(model_nn, os.path.join(args.model_dir, "model.joblib"))
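# A hedged companion sketch (not in the original script): when this estimator is
# deployed with the SageMaker scikit-learn container, an inference script
# typically exposes a model_fn that reloads the artifact saved above. The exact
# hosting setup is an assumption here.
import os
import joblib

def model_fn(model_dir):
    # Load the NearestNeighbors model persisted by the training script.
    return joblib.load(os.path.join(model_dir, "model.joblib"))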
def _generate_testpoints(self, tries=100): """Generate random demo points around decision boundary keypoints """ nn_model = NearestNeighbors(n_neighbors=3) nn_model.fit(self.decision_boundary_points) nn_model_2d = NearestNeighbors(n_neighbors=2) nn_model_2d.fit(self.decision_boundary_points_2d) #max_radius = 2*np.max([nn_model_2d.kneighbors([self.decision_boundary_points_2d[i]])[0][0][1] for i in range(len(self.decision_boundary_points_2d))]) self.X_testpoints = np.zeros((0, self.X.shape[1])) self.y_testpoints = [] for i in range(len(self.decision_boundary_points)): if self.verbose: msg = "Generating testpoint for plotting {}/{}" print(msg.format(i, len(self.decision_boundary_points))) testpoints = np.zeros((0, self.X.shape[1])) # generate Np points in Gaussian around decision_boundary_points[i] with # radius depending on the distance to the next point d, idx = nn_model.kneighbors([self.decision_boundary_points[i]]) radius = d[0][1] if d[0][1] != 0 else d[0][2] if radius == 0: radius = np.mean(pdist(self.decision_boundary_points_2d)) max_radius = radius * 2 radius /= 5.0 # add demo points, keeping some balance max_imbalance = 5.0 y_testpoints = [] for j in range(self.n_generated_testpoints_per_keypoint - 2): c_radius = radius freq = np.array(np.unique(y_testpoints, return_counts=True)).T.astype(float) imbalanced = freq.shape[0] != 0 if freq.shape[0] == 2 and ( freq[0, 1] / freq[1, 1] < 1.0 / max_imbalance or freq[0, 1] / freq[1, 1] > max_imbalance): imbalanced = True for try_i in range(tries): testpoint = np.random.normal( self.decision_boundary_points[i], radius, (1, self.X.shape[1])) try: testpoint2d = self.dimensionality_reduction.transform( testpoint)[0] except: # DR can fail e.g. if NMF gets negative values testpoint = [] continue # demo point needs to be close to current key point if euclidean( testpoint2d, self.decision_boundary_points_2d[i]) <= max_radius: if not imbalanced: # needs to be not imbalanced break y_pred = self.classifier.predict(testpoint)[0] # imbalanced but this would actually improve things if freq.shape[0] == 2 and freq[y_pred, 1] < freq[1 - y_pred, 1]: break c_radius /= 2.0 if len(testpoint) != 0: testpoints = np.vstack((testpoints, testpoint)) y_testpoints.append(self.classifier.predict(testpoint)[0]) self.X_testpoints = np.vstack((self.X_testpoints, testpoints)) self.y_testpoints = np.hstack((self.y_testpoints, y_testpoints)) self.X_testpoints_2d = self.dimensionality_reduction.transform( self.X_testpoints) idx_within_bounds = np.where( (self.X_testpoints_2d[:, 0] >= self.X2d_xmin) & (self.X_testpoints_2d[:, 0] <= self.X2d_xmax) & (self.X_testpoints_2d[:, 1] >= self.X2d_ymin) & (self.X_testpoints_2d[:, 1] <= self.X2d_ymax))[0] self.X_testpoints = self.X_testpoints[idx_within_bounds] self.y_testpoints = self.y_testpoints[idx_within_bounds] self.X_testpoints_2d = self.X_testpoints_2d[idx_within_bounds]
def auto_cluster(filepath, number_of_cluster, n_neighbor, top_num, plot): PhoneValueVector_Chinese, ValueVectorArray_Chi = gainvector(filepath) ld_dbd, combine_df = pre_auto_cluster(PhoneValueVector_Chinese, ValueVectorArray_Chi, n_neighbor, plot) #开始聚类 value_size, feasure_size = ValueVectorArray_Chi.shape nbrs_ = NearestNeighbors(n_neighbors=20, algorithm='brute', metric='cosine').fit(ValueVectorArray_Chi) knn_matrix_ = nbrs_.kneighbors(ValueVectorArray_Chi, return_distance=False) dist_matrix = pairwise_distances(ValueVectorArray_Chi, Y=None, metric='cosine') cluster = [] centers_ = combine_df['value index'].tolist()[0:number_of_cluster] cluster_len1 = 0 cluster_len2 = 1 value_list = [] while len(cluster) != ValueVectorArray_Chi.shape[ 0] - number_of_cluster and cluster_len1 != cluster_len2: cluster_len1 = len(cluster) for j in range(len(PhoneValueVector_Chinese)): if PhoneValueVector_Chinese.loc[j]['value'] not in [ k['value'] for k in cluster ]: if j not in centers_: near_index = ld_dbd['shortest index'][j] near_value = PhoneValueVector_Chinese.loc[near_index][ 'value'] if near_index in centers_: cluster.append({ 'label': PhoneValueVector_Chinese.loc[near_index]['value'], 'value': PhoneValueVector_Chinese.loc[j]['value'], 'dist': dist_matrix[near_index][j] }) value_list.append( PhoneValueVector_Chinese.loc[near_index]['value']) value_list.append( PhoneValueVector_Chinese.loc[j]['value']) elif len([k for k in cluster if k['value'] == near_value ]) > 0: cluster.append({ 'label': [ k['label'] for k in cluster if k['value'] == near_value ][0], 'value': PhoneValueVector_Chinese.loc[j]['value'], 'dist': dist_matrix[near_index][j] }) value_list.append([ k['label'] for k in cluster if k['value'] == near_value ][0]) value_list.append( PhoneValueVector_Chinese.loc[j]['value']) else: continue cluster_len2 = len(cluster) extra = [ a for a in PhoneValueVector_Chinese['value'].tolist() if a not in list(set(value_list)) ] print('已有聚类结果', len(list(set(value_list))), '未有聚类结果', len(extra), 'total', len(PhoneValueVector_Chinese)) cluster_df = pd.DataFrame(cluster) cluster_df = cluster_df.sort_values(by=['label', 'dist']) result = [] center_top_df = cluster_df.loc[cluster_df['label'].isin( combine_df['value name'][:number_of_cluster])] for i in center_top_df['label'].drop_duplicates(): center_top = center_top_df.loc[center_top_df['label'] == i] cluster_ = center_top['value'].tolist()[:top_num] result.append({ 'center': i, 'cluster': cluster_, 'combine': combine_df.loc[combine_df['value name'] == i]['combine'].tolist() [0] }) result.append({'center': 0, 'cluster': extra, 'combine': 0}) result_df = pd.DataFrame(result) result_df = result_df[['center', 'cluster', 'combine']] result_df = result_df.sort_values(by='combine', ascending=False).reset_index() #return result_df,cluster_df # top k of clusters cluster_size = [0] name_of_label = cluster_df['label'].drop_duplicates().tolist() cluster_df_top = pd.DataFrame() auto_cluster = [] auto_remain = [] for k in name_of_label: #top_num auto_cluster.append(k) cluster_df_top = pd.concat([ cluster_df_top, cluster_df.loc[cluster_df['label'] == k][:top_num] ]) cluster_list = cluster_df.loc[cluster_df['label'] == k]['value'][:top_num].tolist() cluster_remain_list = cluster_df.loc[cluster_df['label'] == k]['value'][top_num:].tolist() for m in cluster_list: auto_cluster.append(m) for n in cluster_remain_list: auto_remain.append(n) cluster_size.append(len(auto_cluster)) return result_df, cluster_df_top
import numpy as np
from sklearn.neighbors import NearestNeighbors
import torch
from PIL import Image
import glob
import cv2
from torch.nn.functional import interpolate, softmax
from torchvision.utils import make_grid
from skimage.color import lab2rgb

ab_bins = np.load('pts_in_hull.npy')
nbrs = NearestNeighbors(n_neighbors=5, algorithm='kd_tree', p=2).fit(ab_bins)
ab_bins = torch.from_numpy(ab_bins).cuda().float()


def soft_encode_ab(raw_ab):
    raw_ab = raw_ab.numpy()

    # Flatten (C, A, H, W) array into (C*H*W, A) array
    nax = np.setdiff1d(np.arange(0, raw_ab.ndim), np.array((1)))
    axorder = np.concatenate((nax, np.array(1).flatten()), axis=0)
    flat_ab = raw_ab.transpose((axorder)).reshape((-1, 2))

    # Calculate encodings for each element
    distances, indices = nbrs.kneighbors(flat_ab)
    dist_w = np.exp(-distances**2 / (2 * 5**2))
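    # Hedged sketch of how the soft encoding is usually completed (the original
    # snippet is cut off at this point): normalize the Gaussian weights of the
    # 5 nearest ab bins so each pixel's encoding sums to 1, then scatter the
    # weights into a (num_pixels, num_bins) matrix. Variable names below are
    # illustrative, not the author's.
    dist_w = dist_w / dist_w.sum(axis=1, keepdims=True)
    encoding = np.zeros((flat_ab.shape[0], ab_bins.shape[0]))
    np.put_along_axis(encoding, indices, dist_w, axis=1)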
def affinity(X, algo='lle', n_neighbors=5, epsilon='auto', kernel='rbf', gamma=1, theta=1, lle_diag_fill=False, row_norm=True, n_jobs=-1): """ Compute the affinity matrix. :param X: :param algo: :param n_neighbors: :param epsilon: :param kernel: :param gamma: :param lle_diag_fill: :param row_norm: :param n_jobs: :return: """ algo = algo.lower() assert algo in ['lle', 'epsilon', 'knn', 'kernel'] if algo == 'lle': # locally linear embedding's reconstruction matrix n_neighbors = X.shape[0] knn = NearestNeighbors(n_neighbors + 1, n_jobs=n_jobs).fit( torch.cat([X, torch.zeros(1, X.shape[1])])) X = knn._fit_X n_samples = X.shape[0] - 1 ind = knn.kneighbors(X, return_distance=False)[:-1, 1:] data = _barycenter_weights(X[:-1, :], X[ind]) indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors) af_mat = torch.from_numpy( csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples)).todense()).float() if lle_diag_fill: af_mat += torch.eye(n_samples) return af_mat elif algo == 'knn': # k-nearest neighbors if n_neighbors < 0: n_neighbors = X.shape[0] knn = NearestNeighbors(n_neighbors + 1, n_jobs=n_jobs).fit( torch.cat([X, torch.zeros(1, X.shape[1])])) X = knn._fit_X n_samples = X.shape[0] - 1 ind = knn.kneighbors(X, return_distance=False) af_mat = torch.zeros(n_samples, n_samples) for i in range(n_samples): for j in range(min(n_samples + 1, ind.shape[1])): if ind[i, j] < n_samples: if j < n_neighbors: af_mat[i, ind[i, j]] = 1 else: break # af_mat = torch.from_numpy(csr_matrix((data.ravel(), ind.ravel(), indptr), # shape=(n_samples, n_samples)).todense()).float() return _row_norm(af_mat) if row_norm else af_mat elif algo == 'epsilon': # epsilon nearest neighbors n_samples = X.shape[0] knn = NearestNeighbors(n_samples + 1, n_jobs=n_jobs).fit( torch.cat([X, torch.zeros(1, X.shape[1])])) X = knn._fit_X dist, ind = knn.kneighbors(X, return_distance=True) if isinstance(epsilon, str): assert epsilon in ['auto'] epsilon = np.mean(dist) / 2 af_mat = torch.zeros(n_samples, n_samples) for i in range(n_samples): for j in range(n_samples + 1): if ind[i, j] < n_samples and dist[i, j] <= epsilon: af_mat[i, ind[i, j]] = 1 return _row_norm(af_mat) if row_norm else af_mat elif algo == 'kernel': # heat kernel (rbf or laplacian) if isinstance(kernel, str): assert kernel in ['rbf', 'laplacian'] if kernel == 'rbf': kernel = rbf_kernel elif kernel == 'laplacian': kernel = laplacian_kernel # else predefined kernel func af_mat = torch.from_numpy(kernel(X, gamma=gamma)).float() / theta if n_neighbors > 0: mask = affinity(X, algo='knn', n_neighbors=n_neighbors, row_norm=False, n_jobs=n_jobs) mask -= torch.eye(mask.shape[0]) print(af_mat[0]) print(mask[0]) print() af_mat *= mask return _row_norm(af_mat) if row_norm else af_mat
def perform_type_prediction(self, df):

    def create_binary_type_vector(t_types, a_types):
        vector = np.zeros(len(all_types))
        i = [a_types.index(_) for _ in t_types]
        vector[i] = 1
        return vector

    def create_binary_type_prediction_vector(t_types, a_types):
        vector = np.zeros(len(all_types))
        i = [a_types.index(_) for _ in itertools.chain.from_iterable(t_types)]
        vector[i] += 1
        return vector

    # get the types. Mapping from the index of subject to the index of object
    type_info = ut.deserializer(path=self.p_folder, serialized_name='type_info')

    # get the index of objects / get type information =>>> s #type o
    all_types = sorted(set.union(*list(type_info.values())))

    # Consider only points with type infos.
    e_w_types = df.loc[list(type_info.keys())]

    neigh = NearestNeighbors(n_neighbors=101, algorithm='kd_tree',
                             metric='euclidean', n_jobs=-1).fit(e_w_types)

    # Get similarity results for selected entities
    df_most_similars = pd.DataFrame(neigh.kneighbors(e_w_types, return_distance=False))

    # Reindex the target
    df_most_similars.index = e_w_types.index.values

    # As sklearn implementation of kneighbors returns the point itself as most similar point
    df_most_similars.drop(columns=[0], inplace=True)

    # Map back to the original indexes. KNN does not consider the index of Dataframe.
    mapper = dict(zip(list(range(len(e_w_types))), e_w_types.index.values))
    # The values of most similars are mapped to original vocabulary positions
    df_most_similars = df_most_similars.applymap(lambda x: mapper[x])

    k_values = [1, 3, 5, 10, 15, 30, 50, 100]

    print('K values:', k_values)
    for k in k_values:
        print('#####', k, '####')
        similarities = list()
        for _, S in df_most_similars.iterrows():
            true_types = type_info[_]
            type_predictions = [type_info[_] for _ in S.values[:k]]

            vector_true = create_binary_type_vector(true_types, all_types)
            vector_prediction = create_binary_type_prediction_vector(type_predictions, all_types)

            sim = cosine(vector_true, vector_prediction)
            similarities.append(1 - sim)

        report = pd.DataFrame(similarities)
        print('Mean type prediction', report.mean().values)
from fuzzywuzzy import fuzz
from flask import Flask, request, render_template
from scipy.spatial.distance import cosine
from surprise import SVD
import random

# load data
movie_user_mat_sparse = pickle.load(open('movie_user_mat_sparse', 'rb'))
movie_to_idx = pickle.load(open('movie_to_idx', 'rb'))
model = pickle.load(open('model_svd_100', 'rb'))
trainset = pickle.load(open('trainset', 'rb'))
mname = pickle.load(open('moviename', 'rb'))

# fit knn
model_knn = NearestNeighbors(metric='cosine', algorithm='brute',
                             n_neighbors=50, n_jobs=-1)
model_knn.fit(movie_user_mat_sparse)


def fuzzy_2(favs, m=mname):
    final = []
    for i in favs:
        lst = []
        for j in m:
            ratio = fuzz.ratio(i.lower(), j.lower())
            if ratio >= 70:
                lst.append([j, ratio])
        list.sort(lst, key=lambda x: x[1], reverse=True)
        final.append(lst[0][0])
def _sample(self, X, y): """Resample the dataset. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) If `return_indices` is `True`, a boolean array will be returned containing the which samples have been selected. """ # Assign the parameter of the element of this class # Check that the version asked is implemented if self.version not in [1, 2, 3]: raise ValueError('Parameter `version` must be 1, 2 or 3, got' ' {}'.format(self.version)) # Start with the minority class X_min = X[y == self.min_c_] y_min = y[y == self.min_c_] # All the minority class samples will be preserved X_resampled = X_min.copy() y_resampled = y_min.copy() # Compute the number of cluster needed if self.ratio == 'auto': num_samples = self.stats_c_[self.min_c_] else: num_samples = int(self.stats_c_[self.min_c_] / self.ratio) # If we need to offer support for the indices if self.return_indices: idx_under = np.flatnonzero(y == self.min_c_) # For each element of the current class, find the set of NN # of the minority class # Call the constructor of the NN nn_obj = NearestNeighbors(n_neighbors=self.size_ngh, n_jobs=self.n_jobs, **self.kwargs) # Fit the minority class since that we want to know the distance # to these point nn_obj.fit(X[y == self.min_c_]) # Loop over the other classes under picking at random for key in self.stats_c_.keys(): # If the minority class is up, skip it if key == self.min_c_: continue # Get the samples corresponding to the current class sub_samples_x = X[y == key] sub_samples_y = y[y == key] if self.version == 1: # Find the NN dist_vec, idx_vec = nn_obj.kneighbors( sub_samples_x, n_neighbors=self.size_ngh) # Select the right samples sel_x, sel_y, idx_tmp = self._selection_dist_based( X, y, dist_vec, num_samples, key, sel_strategy='nearest') elif self.version == 2: # Find the NN dist_vec, idx_vec = nn_obj.kneighbors( sub_samples_x, n_neighbors=self.stats_c_[self.min_c_]) # Select the right samples sel_x, sel_y, idx_tmp = self._selection_dist_based( X, y, dist_vec, num_samples, key, sel_strategy='nearest') elif self.version == 3: # We need a new NN object to fit the current class nn_obj_cc = NearestNeighbors(n_neighbors=self.ver3_samp_ngh, n_jobs=self.n_jobs, **self.kwargs) nn_obj_cc.fit(sub_samples_x) # Find the set of NN to the minority class dist_vec, idx_vec = nn_obj_cc.kneighbors(X_min) # Create the subset containing the samples found during the NN # search. 
Linearize the indexes and remove the double values idx_vec = np.unique(idx_vec.reshape(-1)) # Create the subset sub_samples_x = sub_samples_x[idx_vec, :] sub_samples_y = sub_samples_y[idx_vec] # Compute the NN considering the current class dist_vec, idx_vec = nn_obj.kneighbors( sub_samples_x, n_neighbors=self.size_ngh) sel_x, sel_y, idx_tmp = self._selection_dist_based( sub_samples_x, sub_samples_y, dist_vec, num_samples, key, sel_strategy='farthest') else: raise NotImplementedError # If we need to offer support for the indices selected if self.return_indices: idx_under = np.concatenate((idx_under, idx_tmp), axis=0) X_resampled = np.concatenate((X_resampled, sel_x), axis=0) y_resampled = np.concatenate((y_resampled, sel_y), axis=0) self.logger.info('Under-sampling performed: %s', Counter(y_resampled)) # Check if the indices of the samples selected should be returned too if self.return_indices: # Return the indices of interest return X_resampled, y_resampled, idx_under else: return X_resampled, y_resampled
def nearest_neighbors(values, nbr_neighbors=15):
    nn = NearestNeighbors(n_neighbors=nbr_neighbors, metric='euclidean',
                          algorithm='brute').fit(values)
    dists, idxs = nn.kneighbors(values)
    return (dists, idxs)
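# A minimal, hypothetical usage sketch for nearest_neighbors above (not from the
# original code). Because the query points are the fitted points, the first
# neighbour of each row is the point itself with distance 0.
import numpy as np
from sklearn.neighbors import NearestNeighbors

points = np.random.RandomState(0).rand(100, 3)
dists, idxs = nearest_neighbors(points, nbr_neighbors=5)
print(dists.shape, idxs.shape)  # (100, 5): each row lists a point's 5 nearest neighbours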
def knn_prob(X, y, k):
    """Return the share of y == 1 among each sample's k nearest neighbors."""
    knn = NearestNeighbors(n_neighbors=k, n_jobs=24).fit(X)
    nbh = knn.kneighbors(return_distance=False)
    return y.reshape(-1)[nbh].mean(1)
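# A small, hypothetical check of knn_prob above (not from the original code).
# Note that kneighbors() called without arguments returns neighbours of the
# training points themselves, excluding each point, so this is effectively a
# leave-one-out estimate of P(y = 1).
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.rand(200, 2)
y = (X[:, 0] > 0.5).astype(int)   # label depends only on the first feature
p = knn_prob(X, y, k=10)
print(p[:5])                      # estimated P(y=1) for the first few points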
def fill_missvalue( self, dataset, value='mean', consider_cat=True, label_based=False): # methods of dealing with missing value if value == 'linear_knn': # select k-nearest neigbors and using them to do linear_regression to fill the missing catagorical_set = [ 3, 22, 24, 30, 31, 47, 52, 56, 66, 71, 74, 75, 79, 91, 107, 110, 112, 113, 125 ] for i in range(len(catagorical_set)): catagorical_set[i] -= 1 missing_stat = stat.Missing_stat().numofmissing_ofrow(dataset) nonMissingValSet = [] missingIndex = missing_stat[0] dataset, target, id = Pandas_dataProcess().label_extract(dataset, id=True) dataset = dataset.values.tolist() for i in missingIndex: nonMissingValSet.append(dataset[i]) print '1' ########################################## KNN ########################## K = 10 filledSet = [] for record in dataset: nonMissTmp = copy(nonMissingValSet) missedIndex = [] fullSetDic = {} for index in range(len(record)): if record[index] == '': missedIndex.append(index) for index in range(len(nonMissTmp)): distance = 0.0 line = nonMissTmp[index] for i in range(len(line)): if i in missedIndex or i in catagorical_set: continue else: distance += pow( (float(record[i]) - float(line[i])), 2) distance = sqrt(distance) fullSetDic[distance] = [] fullSetDic[distance].extend(nonMissTmp[index]) od = collections.OrderedDict(sorted(fullSetDic.items())) topK = [] for i in range(K): key, value = fullSetDic.popitem() topK.append(value) print '2' ########################################## Linear regression ######################## fillIn = [] train_x = [] for row in range(len(topK)): train_x.append([]) for col in range(len(topK[0])): if col in missedIndex: continue else: train_x[-1].append(topK[row][col]) test_x = [] for i in range(len(record)): if record[i] == '': continue else: test_x.append(record[i]) for i in range(len(record)): if record[i] == '': train_y = [] for row in range(len(topK)): train_y.append(topK[row][i]) regr = linear_model.LinearRegression() regr.fit(train_x, train_y) predict_label = regr.predict(test_x) fillIn.append(predict_label) else: fillIn.append(record[i]) filledSet.append(fillIn) print '3' return filledSet if value == 'nn': # choose the nearest neighbor value to fill missing value copydataset = dataset copydataset = self.remove_categorical(copydataset) copydataset = self.fill_missvalue(copydataset, consider_cat=False) listset = array(copydataset.values.tolist()) nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(listset) distances, indices = nbrs.kneighbors(listset) print indices a = 0 for (dataset_name, dataset_series) in dataset.iteritems(): print a a += 1 nan_num = len(dataset[dataset_series.isnull()]) if nan_num > 0: for index, values in dataset_series.iteritems(): if not dataset.ix[index, dataset_name] < inf: dataset.ix[index, dataset_name] = dataset.ix[ indices[index][1], dataset_name] print dataset return dataset t = 1 if label_based: dataset = dataset.sort(['target'], ascending=False) for (dataset_name, dataset_series) in dataset.iteritems(): print t t += 1 if value == 'mean': # use mean value to fill the missing value nan_num = len(dataset[dataset_series.isnull()]) if nan_num > 0: if label_based: if consider_cat: if dataset_name in self.categorical_set: for (values, times ) in dataset[dataset_name].value_counts( ).iteritems(): mode = values break dataset.loc[dataset_series.isnull(), dataset_name] = mode else: labelbased_mean = [] for (dataset_name2, dataset_series2 ) in dataset.groupby('target'): labelbased_mean.append( dataset_series2[dataset_name].mean()) 
dataset[:87022].loc[ dataset_series.isnull(), dataset_name] = labelbased_mean[1] dataset[87022:].loc[ dataset_series.isnull(), dataset_name] = labelbased_mean[0] else: dataset.loc[dataset_series.isnull(), dataset_name] = dataset_series.mean() else: if consider_cat: if dataset_name in self.categorical_set: for (values, times ) in dataset[dataset_name].value_counts( ).iteritems(): mode = values break dataset.loc[dataset_series.isnull(), dataset_name] = mode else: dataset.loc[ dataset_series.isnull(), dataset_name] = dataset_series.mean() else: dataset.loc[dataset_series.isnull(), dataset_name] = dataset_series.mean() elif value == 'std': # use standard deviation to fill the missing value nan_num = len(dataset[dataset_series.isnull()]) if nan_num > 0: if consider_cat: if dataset_name in self.categorical_set: for (values, times) in dataset[dataset_name].value_counts( ).iteritems(): mode = values break dataset.loc[dataset_series.isnull(), dataset_name] = mode else: dataset.loc[dataset_series.isnull(), dataset_name] = dataset_series.std() else: dataset.loc[dataset_series.isnull(), dataset_name] = dataset_series.std() elif value == 'mode': # use the mode to fill the missing value nan_num = len(dataset[dataset_series.isnull()]) if nan_num > 0: for (values, times ) in dataset[dataset_name].value_counts().iteritems(): mode = values break dataset.loc[dataset_series.isnull(), dataset_name] = mode else: nan_num = len(dataset[dataset_series.isnull()]) if nan_num > 0: dataset.loc[dataset_series.isnull(), dataset_name] = value return dataset
x = x / 255.0
x = x.reshape(-1, 32, 32, 3)
x = x.reshape(-1, 3072)

pca = PCA(128)
pca.fit(x)
x_transformed = pca.transform(x)

n = randrange(60000)
print(n)
query = x_transformed[n]
label = y[n]

n_neigh = 6
x_transformed = x_transformed.reshape(-1, 128)
query = query.reshape(1, 128)

nbrs = NearestNeighbors(n_neighbors=n_neigh, n_jobs=-1).fit(x_transformed)
distances, indices = nbrs.kneighbors(np.array(query))

n_label_names = [label_names[y[i]] for i in indices]
closest_images = x[indices]
closest_images = closest_images.reshape(-1, 32, 32, 3)

plt.imshow(x[n].reshape(32, 32, 3))
plt.title(label_names[label])
plt.show()

plt.figure(figsize=(20, 6))
for i in range(1, n_neigh):
    # display original
    ax = plt.subplot(1, n_neigh, i + 1)
    ax.set_title(n_label_names[0][i])
    plt.imshow(closest_images[i].reshape(32, 32, 3))
    plt.gray()
def create_detector(self):
    self.nearest_neighbor = NearestNeighbors(
        n_neighbors=1,
        metric='mahalanobis',
        metric_params={'V': self.cov, 'VI': np.linalg.inv(self.cov)},
        algorithm='brute')
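# A hedged, self-contained sketch of how a detector like the one above might be
# wired up (the surrounding class is not shown in the original, so the class
# below is illustrative only): the covariance is estimated from the training
# features, the inverse covariance is passed to the Mahalanobis metric, and the
# anomaly score is the Mahalanobis distance to the nearest training sample.
import numpy as np
from sklearn.neighbors import NearestNeighbors

class MahalanobisDetector:
    def fit(self, features):
        self.cov = np.cov(features, rowvar=False)
        self.nearest_neighbor = NearestNeighbors(
            n_neighbors=1, metric='mahalanobis',
            metric_params={'VI': np.linalg.inv(self.cov)},
            algorithm='brute')
        self.nearest_neighbor.fit(features)
        return self

    def score(self, queries):
        dist, _ = self.nearest_neighbor.kneighbors(queries)
        return dist[:, 0]  # larger distance = more anomalous

train = np.random.RandomState(0).normal(size=(500, 8))
scores = MahalanobisDetector().fit(train).score(train + 3.0)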
vec, vocab, dim = read_vector_file(vec_file)
vocab_index = dict()
for i in xrange(0, len(vocab)):
    vocab_index[vocab[i]] = i
num_users = len(vocab)
print "num users in train sequences", num_users
# print "users removed from vocab", len(set(users_train)-set(vocab))
# print "users in test sequences but not in vocab", len(users_test-set(vocab))

# building kd-tree
tic = time.clock()
# kd = KDTree(vec, leafsize=10)
neigh = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='ball_tree',
                         leaf_size=100, metric='minkowski', p=2)  # 'ball_tree', 'kd_tree', 'auto'
neigh.fit(vec)
toc = time.clock()
print "ball tree built in", (toc - tic) * 1000


def get_candidate_set(query_set, next_adopters, N):
    try:
        query_set_ind = [vocab_index[query] for query in query_set]
    except KeyError:
        print "query word not present"
        return
    query_vec = [vec[i] for i in query_set_ind]
    # query using scipy kdtree