def find_edges(input, test, K):
    """Index the rows of ``input`` and return neighbour edges for the rows of ``test``.

    The backend is chosen by the module-level ``kNN_type``:

    * 1 -- ``sklearn.neighbors.NearestNeighbors`` (``ball_tree``)
    * 2 -- ``scipy.spatial.KDTree``
    * 3 -- ``n2.HnswIndex`` (uses the module-level ``distance_type``)
    * 4 -- ``pysparnn`` ``MultiClusterIndex``
    * 5 -- ``nmslib`` HNSW over sparse vectors (uses the module-level
      ``num_threads``)

    Every backend is queried for ``K + 1`` neighbours per row of ``test``.

    Args:
        input: Matrix whose rows are indexed. For backends 1 and 2 it must
            provide ``todense()``. (The name shadows the builtin but is kept
            because it is part of the signature.)
        test: Matrix whose rows are the queries; same requirements as ``input``.
        K: Neighbour count; ``K + 1`` results are requested per query.

    Returns:
        A list of ``(index1, index2)`` tuples where ``index1`` is the row
        position in ``test`` and ``index2`` a returned neighbour position in
        ``input``. Pairs with ``index1 == index2`` are dropped.

    Raises:
        NotImplementedError: ``kNN_type`` is not one of 1-5.
        AssertionError: a query did not yield exactly ``K + 1`` neighbours.
    """
    print("\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    # Backends 1 and 2 are handed dense matrices.
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    # Optional third-party backends are imported lazily so that only the
    # selected one has to be installed.
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC = 30, 100
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_names = ['l2_sparse', 'cosinesimil_sparse']
        space_name = space_names[0]
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K+1, num_threads=num_threads)
        # Each batch result is indexed with [0] to keep only the ids.
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    # All timings are measured from the same start, so they are cumulative.
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        assert len(per) == K + 1, f"index1={index1} len(per)={len(per)} != K+1={K + 1}"
        for index2 in per:
            index2 = int(index2)
            # Skip the pair whose neighbour position equals the query position.
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list
class AKNNPredictor:
    """Nearest-neighbour label predictor.

    ``Train`` builds a neighbour index over the training rows (sklearn
    ``NearestNeighbors`` for scipy-sparse input, an nmslib HNSW index
    otherwise) and stores the label matrix. Prediction looks up neighbours
    of the query rows and delegates scoring to ``ComputeLabelScoreInner``.
    """

    def __init__(self, params):
        """Store ``params['logFile']`` and ``params['seed']`` on the instance."""
        self.logFile = params['logFile']
        self.seed = params['seed']

    @staticmethod
    def _batch_bounds(nt, numThreads):
        """Split ``range(nt)`` into contiguous ``(start, end)`` slices of equal size (last may be shorter)."""
        batchSize = int(math.ceil(float(nt) / numThreads))
        numBatches = int(math.ceil(float(nt) / batchSize))
        return [(i * batchSize, min((i + 1) * batchSize, nt)) for i in range(numBatches)]

    def Train(self, X, Y, numThreads=1):
        """Build the neighbour index over ``X`` and remember the labels ``Y``.

        Args:
            X: Training features; one row per example.
            Y: Training labels; must have as many rows as ``X``.
            numThreads: Passed as ``n_jobs`` to sklearn (sparse path only).
        """
        assert (X.shape[0] == Y.shape[0])
        if issparse(X):
            # The python interface of nmslib library most probably does not
            # support sparse input, so sklearn's NearestNeighbors is used.
            print(
                str(datetime.now()) + " : " +
                "Creating Approximate KNN graph over train examples using sklearn functions"
            )
            self.graph = NearestNeighbors(n_neighbors=10,
                                          radius=5,
                                          algorithm='auto',
                                          metric='l2',
                                          n_jobs=numThreads)
            self.graph.fit(X)
        else:
            print(
                str(datetime.now()) + " : " +
                "Creating Approximate KNN graph over train examples using HANN"
            )
            self.graph = nmslib.init(method='hnsw', space='l2')
            self.graph.addDataPointBatch(X)
            self.graph.createIndex({
                'post': 2,
                'M': 10,
                'maxM0': 20
            }, print_progress=False)
        self.Y = Y

    def Predict(self, Xt, nnTest, numThreads=1):
        """Return label scores for ``Xt`` using ``nnTest`` neighbours per row."""
        # Compute K nearest neighbors for input data
        print(str(datetime.now()) + " : " + "Computing Approximate KNN")
        knn = self.ComputeAKNN(Xt, nnTest, numThreads)

        # Predict labels for input data
        print(str(datetime.now()) + " : " + "Performing prediction")
        predYt = self.ComputeLabelScore(knn, nnTest, numThreads)
        return predYt

    def ComputeLabelScore(self, KNN, nnTest, numThreads=1):
        """Score labels for each row of the neighbour matrix ``KNN``.

        Rows are split into batches that are scored in parallel by
        ``ComputeLabelScoreInner(Y, KNN_batch, nnTest)`` and stacked back
        together in order.

        Returns:
            A ``lil`` sparse matrix with one row per row of ``KNN``; for an
            empty ``KNN`` an empty matrix with ``self.Y.shape[1]`` columns.
        """
        if (KNN.shape[0] == 0):
            return lil_matrix((0, self.Y.shape[1]), dtype=float)

        Y = self.Y
        nt = KNN.shape[0]
        bounds = self._batch_bounds(nt, numThreads)

        resultList = Parallel(n_jobs=numThreads)(
            delayed(ComputeLabelScoreInner)(Y, KNN[s:e, :], nnTest)
            for s, e in bounds)
        predYt = vstack(resultList, format='lil')
        assert (predYt.shape[0] == nt)
        return predYt

    def ComputePrecision(self, predYt, Yt, K, numThreads):
        """Average the per-batch results of ``ComputePrecisionInner`` over all rows.

        Each batch result is weighted by its number of rows, summed, and
        divided by the total row count.

        Returns:
            A ``(K, 1)`` array for non-empty input.
            NOTE(review): the empty-input case returns shape ``(K,)`` instead;
            left unchanged because callers may depend on it.
        """
        assert (predYt.shape == Yt.shape)
        if (predYt.shape[0] == 0):
            return np.zeros((K), dtype=float)

        nt, _ = Yt.shape
        bounds = self._batch_bounds(nt, numThreads)

        resultList = Parallel(n_jobs=numThreads)(
            delayed(ComputePrecisionInner)(predYt[s:e, :], Yt[s:e, :], K)
            for s, e in bounds)
        precision = np.zeros((K, 1))
        for (s, e), res in zip(bounds, resultList):
            precision += res * (e - s)
        precision /= float(nt)
        return precision

    def ComputeAKNN(self, Xt, nnTest, numThreads=1):
        """Return an ``(Xt.shape[0], nnTest)`` int matrix of neighbour indices.

        At most ``min(nnTest, Xt.shape[0])`` neighbours are requested from the
        index; when fewer than ``nnTest`` come back, the returned ids are
        repeated cyclically to fill the row.

        The branch is chosen by ``issparse(Xt)``, so ``Xt`` has to be of the
        same kind (sparse / dense) as the data passed to ``Train``.

        Raises:
            ValueError: the nmslib index returned no neighbour for a query
                while ``nnTest > 0`` (nothing to pad with).
        """
        if (Xt.shape[0] == 0):
            return np.zeros((0, nnTest), dtype=np.int64)

        if (issparse(Xt)):
            KNN = self.graph.kneighbors(Xt,
                                        min(nnTest, Xt.shape[0]),
                                        return_distance=False)
            if (KNN.shape[1] < nnTest):
                # Tile the columns as many whole times as fit, then top up
                # with the leading columns.
                rf = int(nnTest / KNN.shape[1])
                KNN = np.hstack(tuple([KNN] * rf))
                KNN = np.hstack((KNN, KNN[:, :(nnTest - KNN.shape[1])]))
        else:
            neighbors = self.graph.knnQueryBatch(Xt,
                                                 min(nnTest, Xt.shape[0]),
                                                 num_threads=numThreads)
            # Create the KNN matrix
            KNN = np.zeros((Xt.shape[0], nnTest), dtype=np.int64)
            for i, nei in enumerate(neighbors):
                # nei[0] holds the neighbour ids as a 1-D sequence, so its
                # length is shape[0] (shape[1] does not exist).
                ids = np.asarray(nei[0])
                found = ids.shape[0]
                if found == 0:
                    if nnTest > 0:
                        raise ValueError(
                            f"no neighbours returned for query row {i}")
                    continue
                # Cyclic indexing both copies the ids and pads short rows.
                KNN[i, :] = ids[np.arange(nnTest) % found]

        return KNN

    def PredictAndComputePrecision(self, Xt, Yt, nnTestList, maxTestSamples,
                                   numThreads):
        """Compute precision for every neighbour count in ``nnTestList``.

        Neighbours are computed once for ``max(nnTestList)`` and reused for
        every entry. If ``maxTestSamples > 0`` the data is first reduced with
        ``DownSampleData``. Precision is computed with ``K = 5``.

        Returns:
            A list with one ``{'precision': ...}`` dict per entry of
            ``nnTestList``, in the same order.
        """
        assert (Xt.shape[0] == Yt.shape[0])

        # Perform down sampling of input data
        if (maxTestSamples > 0):
            Xt, Yt, _ = DownSampleData(Xt, Yt, maxTestSamples)

        maxNNTest = max(nnTestList)
        # Compute K nearest neighbors for input data
        print(str(datetime.now()) + " : " + "Computing KNN")
        knn = self.ComputeAKNN(Xt, maxNNTest, numThreads)

        resList = []
        for nnTest in nnTestList:
            # Predict labels for input data
            print(
                str(datetime.now()) + " : " +
                "Performing prediction for nnTest = " + str(nnTest))
            predYt = self.ComputeLabelScore(knn, nnTest, numThreads)

            # Compute precisions for input data
            print(
                str(datetime.now()) + " : " +
                "Computing precisions for nnTest = " + str(nnTest))
            precision = self.ComputePrecision(predYt, Yt, 5, numThreads)
            resList.append({'precision': precision})

        return resList

    def UpdateLogFile(self, logFile):
        """Replace the stored log file."""
        self.logFile = logFile

    def UpdateSeed(self, seed):
        """Replace the stored seed."""
        self.seed = seed
def find_edges(input, test, K, cluster_ids, query_ids):
    """Index the rows of ``input`` and return id-mapped neighbour edges for ``test``.

    The backend is chosen by the module-level ``kNN_type``:

    * 1 -- ``sklearn.neighbors.NearestNeighbors`` (``ball_tree``)
    * 2 -- ``scipy.spatial.KDTree``
    * 3 -- ``n2.HnswIndex`` (uses the module-level ``distance_type``)
    * 4 -- ``pysparnn`` ``MultiClusterIndex``
    * 5 -- ``nmslib`` HNSW with the ``cosinesimil_sparse`` space

    Args:
        input: Matrix whose rows are indexed. For backends 1 and 2 it must
            provide ``todense()``. (The name shadows the builtin but is kept
            because it is part of the signature.)
        test: Matrix whose rows are the queries; same requirements as ``input``.
        K: Neighbour count. Backends 1-4 request ``K + 1`` results per query,
            backend 5 requests ``K``.
        cluster_ids: Sequence mapping a row position of ``input`` to the id
            emitted as the second element of an edge.
        query_ids: Sequence mapping a row position of ``test`` to the id
            emitted as the first element of an edge.

    Returns:
        A list of ``(query_ids[index1], cluster_ids[index2])`` tuples, where
        ``index1`` is a row position in ``test`` and ``index2`` a returned
        neighbour position in ``input``. Pairs whose two positions are equal
        are dropped.

    Raises:
        NotImplementedError: ``kNN_type`` is not one of 1-5.
    """
    print("\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    # Backends 1 and 2 are handed dense matrices.
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    # Optional third-party backends are imported lazily so that only the
    # selected one has to be installed.
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        # NOTE(review): this backend asks for K neighbours while the others
        # ask for K + 1 -- confirm which is intended.
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        # Each batch result is indexed with [0] to keep only the ids.
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    # All timings are measured from the same start, so they are cumulative.
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            # NOTE(review): index1 is a position in `test` and index2 a
            # position in `input`; this filter only removes self-matches if
            # both collections share the same row order -- confirm.
            if index1 != index2:
                # Neighbour positions refer to rows of `input`, hence
                # cluster_ids (the original referenced an undefined name).
                edge_list.append((query_ids[index1], cluster_ids[index2]))
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list