def _phase2(self):
    """
    Execute phase 2 of the SP region. This phase is used to compute the
    active columns.

    Note - This should only be called after phase 1 has been called and
    after the inhibition radius and neighborhood have been updated.
    """
    # Shift the outputs
    self.y[:, 1:] = self.y[:, :-1]
    self.y[:, 0] = 0

    # Calculate k
    #  - For a column to be active its overlap must be above the overlap
    #    value of the k-th largest column in its neighborhood.
    k = self._get_num_cols()

    if self.global_inhibition:
        # The neighborhood is all columns, thus the set of active columns
        # is simply columns that have an overlap above the k-th largest
        # in the entire region

        # Compute the winning column indexes
        if self.learn:
            # Randomly break ties
            ix = bn.argpartsort(-self.overlap[:, 0] -
                self.prng.uniform(.1, .2, self.ncolumns), k)[:k]
        else:
            # Choose the same set of columns each time
            ix = bn.argpartsort(-self.overlap[:, 0], k)[:k]

        # Set the active columns
        self.y[ix, 0] = self.overlap[ix, 0] > 0
    else:
        # The neighborhood is bounded by the inhibition radius, therefore
        # each column's neighborhood must be considered
        for i in xrange(self.ncolumns):
            # Get the neighbors
            ix = np.where(self.neighbors[i])[0]

            # Compute the minimum top overlap
            if ix.shape[0] <= k:
                # Desired number of candidates is at or below the desired
                # activity level, so find the overall max
                m = max(bn.nanmax(self.overlap[ix, 0]), 1)
            else:
                # Desired number of candidates is above the desired
                # activity level, so find the k-th largest
                m = max(-bn.partsort(-self.overlap[ix, 0], k + 1)[k], 1)

            # Set the column activity
            if self.overlap[i, 0] >= m:
                self.y[i, 0] = True
def extractIrIcR(line, d1):
    # zeroIdx assumes that there is no offset in the axis
    zeroIdx = round(d1.start*d1.pt/(d1.start-d1.stop))
    zeroIdx = 452  # Temp override the autofound value
    l1 = line[:zeroIdx]
    l2 = line[zeroIdx:]
    IrIdx = np.sort(bn.argpartsort(-l1, 3)[:3])
    IcIdx = np.sort(bn.argpartsort(-l2, 13)[:13]) + zeroIdx
    # store only peaks closest to zero
    Ir = d1.lin[IrIdx[-1]]*1e-6
    Ic = d1.lin[IcIdx[1]]*1e-6
    R = np.mean(line[IrIdx[:]])
    return Ir, Ic, R
def MAP_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, mu=None,
                   k=100, vad_data=None):
    '''
    mean average precision@k
    '''
    batch_users = user_idx.stop - user_idx.start
    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users,
                              mu=mu, vad_data=vad_data)
    idx_topk_part = bn.argpartsort(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    aps = np.zeros(batch_users)
    for i, idx in enumerate(xrange(user_idx.start, user_idx.stop)):
        actual = heldout_data[idx].nonzero()[1]
        if len(actual) > 0:
            predicted = idx_topk[i]
            aps[i] = apk(actual, predicted, k=k)
        else:
            aps[i] = np.nan
    return aps
def precision_at_k_batch(train_data, vad_data, test_data, Et, Eb,
                         user_idx, k=20, normalize=True):
    batch_users = user_idx.stop - user_idx.start
    X_pred = _make_prediction(train_data, vad_data, Et, Eb, user_idx,
                              batch_users)
    idx = bn.argpartsort(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.tile(np.arange(batch_users), (k, 1)).T, idx[:, :k]] = True
    X_true_binary = (test_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
    if normalize:
        precision = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    else:
        precision = tmp / k
    return precision
def get_non_elites(self, limit):
    # get indexes of chromosomes that are not elites
    indexes = bn.argpartsort(-self.fitnesses, n=self.NUM_ELITES)
    num_no_elites = self.NUM_CHROMOSOMES - self.NUM_ELITES
    no_elites = indexes[-num_no_elites:]
    np.random.shuffle(no_elites)
    return no_elites[:limit]
def min_k_indices(arr, k, inv_ind=False):
    """Returns indices of the k-smallest values in each row, unsorted.

    The `inv_ind` flag returns the tuple (k-smallest,(n-k)-largest).
    """
    psorted = argpartsort(arr, k)
    if inv_ind:
        return psorted[..., :k], psorted[..., k:]
    return psorted[..., :k]
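# A minimal usage sketch for min_k_indices above, assuming the legacy
# bottleneck argpartsort import it relies on (the array and k here are
# illustrative, not from the original source):
import numpy as np
from bottleneck import argpartsort

arr = np.array([[9., 1., 7., 3., 5.],
                [2., 8., 4., 6., 0.]])
small, rest = min_k_indices(arr, 2, inv_ind=True)
# Row 0 of `small` contains indices {1, 3} (values 1. and 3.) in some
# order; `rest` holds the remaining, larger-valued indices of each row.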
def recall_at_k_batch(train_data, heldout_data, Et1, Eb1, Et2, Eb2, user_idx,
                      k=20, normalize=True, mu=None, vad_data=None):
    batch_users = user_idx.stop - user_idx.start
    X_pred = _make_prediction(train_data, Et1, Eb1, Et2, Eb2, user_idx,
                              batch_users, mu=mu, vad_data=vad_data)
    idx = bn.argpartsort(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall
def eval_multiple(true_scores, pred_scores, topk):
    # convert to ndarrays before partitioning/fancy indexing, which both
    # require array inputs (the original converted after the fact)
    if not isinstance(pred_scores, np.ndarray):
        pred_scores = np.array(pred_scores)
    if not isinstance(true_scores, np.ndarray):
        true_scores = np.array(true_scores)
    idx = bottleneck.argpartsort(-pred_scores, topk)[:topk]
    noise = np.random.random(topk)
    rec = sorted(zip(pred_scores[idx], noise, true_scores[idx]), reverse=True)
    nhits = 0.0
    nhits_topk = 0.0
    k = topk if topk >= 0 else len(rec)
    sumap = 0.0
    for i in range(len(rec)):
        if rec[i][-1] != 0.0:
            nhits += 1.0
            if i < k:
                nhits_topk += 1
                sumap += nhits / (i + 1.0)
    nhits = np.sum(true_scores)
    if nhits != 0:
        sumap /= min(nhits, k)
        map_at_k = sumap
        recall_at_k = nhits_topk / nhits
        precision_at_k = nhits_topk / k
    else:
        map_at_k = 0.0
        recall_at_k = 0.0
        precision_at_k = 0.0
    return map_at_k, recall_at_k, precision_at_k
def process(self, image):
    '''Performs llc encoding.
    '''
    K = self.specs.get('k', 5)
    reg = self.specs.get('reg', 1e-4)
    D = self.dictionary
    shape = image.shape[:-1]
    X = image.reshape((np.prod(shape), image.shape[-1]))
    # D_norm is the precomputed norm of the entries
    if 'D_norm' not in self.specs:
        self.specs['D_norm'] = (D**2).sum(1) / 2.
    D_norm = self.specs['D_norm']
    distance = mathutil.dot(X, -D.T)
    distance += D_norm
    # find the K closest indices
    if bn is not None:
        # use bottleneck which would be faster
        IDX = bn.argpartsort(distance, K, axis=1)[:, :K]
    else:
        IDX = np.argsort(distance, 1)[:, :K]
    # do LLC approximate coding
    coeff = np.zeros((X.shape[0], D.shape[0]))
    ONES = np.ones(K)
    Z = np.empty((K, D.shape[1]))
    for i in range(X.shape[0]):
        # shift to origin
        Z[:] = D[IDX[i]]
        Z -= X[i]
        # local covariance
        C = mathutil.dot(Z, Z.T)
        # add regularization
        C.flat[::K+1] += reg * C.trace()
        w = np.linalg.solve(C, ONES)
        coeff[i][IDX[i]] = w / w.sum()
    return coeff.reshape(shape + (coeff.shape[1],))
def NDCG_binary_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                           mu=None, k=100, vad_data=None):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = user_idx.stop - user_idx.start
    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users,
                              mu=mu, vad_data=vad_data)
    idx_topk_part = bn.argpartsort(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    heldout_batch = heldout_data[user_idx]
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
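# A hand-checkable sketch of the DCG/IDCG arithmetic used above, on toy
# dense data with k=3 (the numbers are illustrative, not from the source):
import numpy as np

tp = 1. / np.log2(np.arange(2, 3 + 2))   # discounts: 1.0, ~0.631, 0.5
relevance_at_topk = np.array([1, 0, 1])  # binary relevance of the ranked items
DCG = (relevance_at_topk * tp).sum()     # 1/log2(2) + 1/log2(4) = 1.5
IDCG = tp[:2].sum()                      # user has 2 relevant items: ~1.631
ndcg = DCG / IDCG                        # ~0.92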
def recall_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                      k=20, vad_data=None):
    batch_users = user_idx.stop - user_idx.start
    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users,
                              vad_data=vad_data)
    idx = bn.argpartsort(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.tile(np.arange(batch_users), (k, 1)).T, idx[:, :k]] = True
    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
    recall = tmp / X_true_binary.sum(axis=1)
    return recall
def newton_optimized(values, k, bottom=False):
    T = values[:k]
    prec_avg = weighted_average(T)
    while True:
        if bottom:
            T_next = argpartsort([vj - prec_avg * wj for (vj, wj) in values], k)[:k]
        else:
            T_next = argpartsort([prec_avg * wj - vj for (vj, wj) in values], k)[:k]
        now_avg = weighted_average((values[j][0], values[j][1]) for j in T_next)
        if abs(prec_avg - now_avg) < 10**(-9):
            break
        prec_avg = now_avg
    return now_avg
def _knn_euclidean(X, mask, Xc, k, verbose=False):
    t = time.time()
    nn_ind = np.zeros((X.shape[0], k), dtype=int)
    for n in xrange(X.shape[0]):
        dists = euclidean_distances(Xc[:, mask[n, :]], X[n, mask[n, :]],
                                    squared=True).flatten()
        nn_ind[n, :] = bn.argpartsort(dists, k)[:k]
    if verbose:
        print('Finished knn in {:.3f} s'.format(time.time() - t))
    return nn_ind
def _knn_dot(X, mask, Xc, k, verbose=False):
    t = time.time()
    nn_ind = np.zeros((X.shape[0], k), dtype=int)
    for n in xrange(X.shape[0]):
        dists = -np.dot(Xc[:, mask[n, :]], X[n, mask[n, :]])
        nn_ind[n, :] = bn.argpartsort(dists, k)[:k]
    if verbose:
        print('Finished knn in {:.3f} s'.format(time.time() - t))
    return nn_ind
def retrieve(code, k):
    """ Retrieve top k nearest images from training set """
    assert(train_hash_code.shape[0] > k), 'Invalid k'
    inv_code = 1 - code
    hamming_dist = 48 - (np.dot(train_hash_code, code) +
                         np.dot(inv_train_hash_code, inv_code))
    top_k_idx_unordered = bn.argpartsort(hamming_dist, k)[:k]  # get indices of k minimum
    top_k_hamm_dist = hamming_dist[top_k_idx_unordered]
    top_k_idx_ordered = top_k_idx_unordered[top_k_hamm_dist.argsort()]
    return top_k_idx_ordered
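# Why 48 - (dot(code) + dot(inv_code)) above is the Hamming distance: for
# binary vectors, the first dot counts positions where both bits are 1 and
# the inverted dot counts positions where both are 0, so their sum counts
# all agreements. A quick self-check sketch (48-bit codes assumed, as above):
import numpy as np

a = np.random.randint(0, 2, 48)
b = np.random.randint(0, 2, 48)
hamming = 48 - (np.dot(a, b) + np.dot(1 - a, 1 - b))
assert hamming == np.sum(a != b)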
def get_top_n(self, probs, labels):
    '''
    Get top n most likely subreddits. Each row in probs should correspond
    to the subreddit probability distribution over an item in the
    prediction set. Labels should be subreddit labels which reflect the
    ordering of probs.

    Resources:
    http://stackoverflow.com/questions/10337533/a-fast-way-to-find-the-largest-n-elements-in-an-numpy-array
    '''
    return labels[bn.argpartsort(-probs, self.num_suggestions,
                                 axis=1)[:, :self.num_suggestions]]
def find_geo_NN(lat, long, location_data, K=1):
    # location_data is a 2-d nx2 numpy array of lat-long coordinates.
    v = ((location_data - np.array([lat, long]))**2).sum(axis=1)
    ix2 = bn.argpartsort(v, K, axis=None)[:K]
    #print ix
    #ix2 = ix[np.nonzero(v[ix] < 100)]
    #ix2 = np.append(ix2, np.random.randint(0, location_data.shape[0], (1, K - ix2.shape[0])))
    #print ix2
    return ix2
def argsort(x, topn=None):
    """Return indices of the `topn` greatest elements in numpy array `x`, in order."""
    if topn is None:
        topn = x.size
    if topn <= 0:
        return []
    if topn >= x.size:
        return numpy.argsort(x)[::-1]
    biggest = bottleneck.argpartsort(x, x.size - topn)[-topn:]
    # the indices in `biggest` are not sorted by magnitude => sort & return
    return biggest.take(numpy.argsort(x.take(biggest))[::-1])
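# A minimal usage sketch for the argsort helper above (toy array; assumes
# the helper and its numpy/bottleneck imports are in scope):
import numpy

x = numpy.array([0.1, 0.9, 0.3, 0.7, 0.5])
print(argsort(x, topn=3))  # -> [1 3 4]: indices of the 3 largest, descending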
def get_rec(self, algname, user, max=10, **param):
    """Wrapper around the real recommendation getter to set parameters"""
    if algname == 'AsySVD':
        param = dict(param, userToTest=user)
    rec = self._get_rec(algname, user, **param)
    rec = numpy.squeeze(rec)
    indexes = bottleneck.argpartsort(rec, rec.size - max, axis=0)[-max:]
    rec = [(rec[index], index) for index in indexes]
    rec.sort(reverse=True)
    return rec
def _knn_euclid_helper(self, D):
    """Helper function to reduce memory consumption.
    """
    dist = distance.cdist(D, self.C, 'sqeuclidean')
    L = D.shape[0]  # number of samples
    k = self.K      # number of nearest neighbours to return
    k_idx = argpartsort(dist, k, 1)[:, :k]            # K smallest indices, unordered
    k_dist = dist[[[t]*k for t in xrange(L)], k_idx]  # get distances, unordered
    idx = np.argsort(k_dist, 1)                       # get correct ordering
    k_dist = k_dist[[[t]*k for t in xrange(L)], idx]  # apply ordering to distances
    k_idx = k_idx[[[t]*k for t in xrange(L)], idx]    # apply ordering to indices
    return k_idx, k_dist
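# Hedged aside, not from the original code: the [[t]*k for t in xrange(L)]
# row-index trick above applies a per-row ordering; on numpy >= 1.15 the
# same reordering is expressible with take_along_axis. A self-check sketch:
import numpy as np

k_dist = np.array([[4., 1., 9.], [3., 8., 2.]])
idx = np.argsort(k_dist, axis=1)
L, k = k_dist.shape
old_way = k_dist[[[t] * k for t in range(L)], idx]
new_way = np.take_along_axis(k_dist, idx, axis=1)
assert np.array_equal(old_way, new_way)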
def most_similar_to(self, vec):
    """ Get top num most similar vectors """
    vecnorm = NP.sqrt(NP.sum(vec * vec))
    numerator = NP.sum(vec.reshape(1, -1) * self.vecs, axis=1)
    denominator = vecnorm * self.vecsnorm
    sims = numerator / denominator
    if 0 < self.simnum < sims.shape[0]:
        n_idc = BN.argpartsort(-sims, self.simnum, axis=None)[:self.simnum]
        return NP.array([self.vecs[i] for i in n_idc], dtype=NP.float32), \
               NP.array(sims[n_idc], dtype=NP.float32)
    else:
        return self.vecs, sims
def CalcSimilarUsersSongs(userid):
    usersongsset = userDict[userid].keys()
    usersongintersection = p.DataFrame(index=[userid])
    top5similarusers = []
    # iterate over keys, not (key, value) pairs, so the dict lookup works
    for otheruserid in userDict.iterkeys():
        otherusersongsset = userDict[otheruserid].keys()
        usersongintersection.insert(0, otheruserid,
            len(np.intersect1d(usersongsset, otherusersongsset, False)), False)
    top5similarusers = usersongintersection.loc[userid][
        bn.argpartsort(-usersongintersection.loc[userid], 5)[:5]].index.values
    unlistenedsongs = np.array([])
    # use the loop variable (and keep the result of union1d), rather than
    # the stale otheruserid left over from the previous loop
    for similaruserid in top5similarusers:
        otherusersongsset = userDict[similaruserid].keys()
        unlistenedsongs = np.union1d(unlistenedsongs,
                                     np.setdiff1d(otherusersongsset, usersongsset))
        if len(unlistenedsongs) >= 5:
            break
    return unlistenedsongs
def recall_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, k=20,
                      normalize=True, mu=None, vad_data=None):
    batch_users = user_idx.stop - user_idx.start
    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users,
                              mu=mu, vad_data=vad_data)
    idx = bn.argpartsort(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall
def get_dist_bln(P, Q, A, m, K):
    """Distance function using Bottleneck.
    """
    Z = (P+Q).dot(A)
    Z = Z**m
    Z[Z == 0] = 1
    D = (P-Q)/Z
    # calc only diagonal of dot operation, for multiple vectors at a time
    DA = D.dot(A)
    sqdist = np.einsum('ij,ji->i', DA, D.T)  # squared distance
    # getting the closest K members
    k_idx = argpartsort(sqdist, K)[:K]
    k_dist = sqdist[k_idx]
    idx = np.argsort(k_dist)
    k_dist = k_dist[idx] ** 0.5
    k_idx = k_idx[idx]
    return (k_idx, k_dist)
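# The einsum above computes only the diagonal of D.dot(A).dot(D.T), which
# is all the squared distances require. A small verification sketch with
# toy shapes (not from the original source):
import numpy as np

D = np.random.rand(5, 3)
A = np.random.rand(3, 3)
DA = D.dot(A)
fast = np.einsum('ij,ji->i', DA, D.T)  # per-row quadratic form only
slow = np.diag(DA.dot(D.T))            # full product, then diagonal
assert np.allclose(fast, slow)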
def recall_at_multiple_ks_batch(train_data, heldout_data, Et, Eb, user_idx,
                                topks, vad_data):
    batch_users = user_idx.stop - user_idx.start
    X_pred = rec_eval._make_prediction(train_data, Et, Eb, user_idx,
                                       batch_users, vad_data=vad_data)
    recalls = np.empty((len(topks), batch_users))
    for i, k in enumerate(topks):
        idx = bn.argpartsort(-X_pred, k, axis=1)
        X_pred_binary = np.zeros_like(X_pred, dtype=bool)
        X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
        X_true_binary = (heldout_data[user_idx] > 0).toarray()
        tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
        recalls[i] = tmp / X_true_binary.sum(axis=1)
    return recalls
def get_n_max_sal(sal, n):
    """
    Receives: saliency map
              n, number of points to return
    Returns: numpy array with the indices of maximum points
    """
    # max number of candidate indices
    max_cand = n
    # distance threshold for distance based cutting out
    #thresh = 10
    sal_ = sal
    #max_indices = bn.argpartsort(-1*sal_.flatten(), n)[:n]
    size = sal_.shape[0] * sal.shape[1]
    #cand_indices = bn.argpartsort(-1*sal_.flatten(), max_cand)[:max_cand]
    max_indices = bn.argpartsort(-1 * sal_.flatten(), n)[:n]
    # pdist needs matrix, not 1d array
    #max_indices = np.reshape(max_indices, (max_indices.shape, 1))
    # iterating over max indices to see if there are points too close to each
    # other. if there is, we must eliminate one of them and pick the next one
    # after the n max saliency points
    """
    incr = 0
    for i in range(n):
        # obtaining distance matrix
        dist_mat = squareform(pdist(max_indices, 'minkowski', 1))
        for j in range(n):
            if (dist_mat[i, j] < thresh) and (i != j):
                max_indices[j] = cand_indices[n+incr]
                incr += 1
    """
    max_indices = np.unravel_index(max_indices, sal_.shape)
    print(max_indices)
    return max_indices
def get_wifi_top(wifi_str='', ntop=15):
    # get the ids of the ntop (default 15) strongest wifi signals for a shop
    str_list = wifi_str.split(';')
    wifi_ifos = np.array([x.strip(' ').split('|')[:3] for x in str_list])
    w_name = wifi_ifos[:, 0]    # wifi names in the list
    w_value = [int(x) for x in wifi_ifos[:, 1]]
    w_state = wifi_ifos[:, 2]   # connection state
    if 'true' in w_state:
        connection_wifi_name = w_name[w_state.tolist().index('true')]
    else:
        connection_wifi_name = 'unkown'
    if len(wifi_ifos) > ntop:
        top_5_idx = bottleneck.argpartsort(-np.array(w_value), ntop)[:ntop]
        return wf_name_2_idx(w_name[top_5_idx]), wf_name_2_idx([connection_wifi_name])
    else:
        sort_idx = np.argsort(-np.array(w_value))
        w_name = w_name[sort_idx].tolist()
        w_name.extend(['b_null'] * (ntop - len(wifi_ifos)))
        return wf_name_2_idx(w_name), wf_name_2_idx([connection_wifi_name])
def output(self, outpath, words, z):
    #topIndices = bn.argpartsort(-self.p_wz_n[:][z])
    with open(os.path.join(outpath, str(z) + "-topic.txt"), 'w') as outfile:
        for i in range(z):
            #print self.p_wz_n.shape
            #print self.p_wz_n[:,i]
            topIndices = bn.argpartsort(-self.p_wz_n[:, i], 20)[:20]
            # print topIndices
            topList = []
            for index in topIndices:
                topList.append([index, self.p_wz_n[index, i]])
            sortedList = sorted(topList, key=lambda x: (-x[1]))
            outfile.write("Topic " + str(i) + ":\n")
            for w in sortedList:
                outfile.write(words[w[0]+1] + ":" + str(w[1]) + "\n")
    with open(os.path.join(outpath, "k-likelihood-time.txt"), 'a') as outfile:
        outfile.write(str(z) + ' ' + str(self.log_like) + ' ' + str(self.time) + '\n')
def nn(feat, feats, distance='euclidean', K=-1):
    """
    Exact nearest neighbor search through exhaustive comparison.
    """
    if distance == 'manhattan':
        dists = metrics.manhattan_distances(feat, feats)
    elif distance == 'euclidean':
        dists = metrics.euclidean_distances(feat, feats, squared=True)
    elif distance == 'chi_square':
        dists = -metrics.additive_chi2_kernel(feat, feats)
    dists = dists.flatten()
    if K > 0:
        nn_ind = bn.argpartsort(dists, K).flatten()[:K]
        nn_ind = nn_ind[np.argsort(dists[nn_ind])]
    else:
        nn_ind = np.argsort(dists)
    nn_dist = dists[nn_ind]
    return nn_ind, nn_dist
def bn_topargn(arr, N, ascending=None):
    """
    Return the indices of the top N results.

    The following should be equivalent
    >>> res1 = arr[bn_topargn(arr, 10)]
    >>> res2 = bn_topn(arr, 10)
    >>> np.all(res1 == res2)
    True
    """
    if arr.ndim > 1:
        raise Exception("Only works on ndim=1")
    if ascending is None:
        ascending = not N > 0
    na_mask = np.isnan(arr)
    has_na = na_mask.sum()
    if has_na:
        # store the old indices for translating back later
        old_index_map = np.where(~na_mask)[0]
        arr = arr[~na_mask]
    if N > 0:  # nlargest
        N = len(arr) - abs(N)
        sl = slice(N, None)
    else:  # nsmallest
        N = abs(N)
        sl = slice(None, N)
    out = nb.argpartsort(arr, N)
    index = out[sl]
    # sort the index by their values
    index_sort = np.argsort(arr[index])
    if not ascending:
        index_sort = index_sort[::-1]
    index = index[index_sort]
    # index is only correct with arr without nans.
    # Map back to old_index if needed
    if has_na:
        index = old_index_map[index]
    return index
def keep_k_best(co_occ, k=200):
    """
    Keep the ``k`` best values in the matrix and set the rest to 0.
    Relies on the bottleneck library for fast sort.

    Args:
        * ``co_occ`` (*ndarray*): input matrix.
        * ``k`` (*int, optional*): number of values to keep. Defaults to 200.

    Returns:
        * ``normalized`` (*ndarray*): normalized matrix.
    """
    import bottleneck as bn
    part = bn.argpartsort(-co_occ, k, axis=1)[:, :k]
    for line in xrange(co_occ.shape[0]):
        c = co_occ[line, :]
        kbest = c[part[line, -1]]
        c[c < kbest] = 0.
        co_occ[line, :] = c
    return co_occ
def get_knn(self, D):
    """Indexes and distances of *k* nearest neighbours.

    Uses *argpartsort* for fast determination of kNN. Currently used.
    """
    if self.C is None:
        return "Uninitialized centroids"
    dist = distance.cdist(D, self.C, 'sqeuclidean')
    L = D.shape[0]    # number of samples
    k = cf._nn_count  # number of nearest neighbours
    k_idx = argpartsort(dist, k, 1)[:, :k]            # K smallest indices, unordered
    k_dist = dist[[[t]*k for t in xrange(L)], k_idx]  # get distances, unordered
    idx = np.argsort(k_dist, 1)                       # get correct ordering
    k_dist = k_dist[[[t]*k for t in xrange(L)], idx]  # apply ordering to distances
    k_idx = k_idx[[[t]*k for t in xrange(L)], idx]    # apply ordering to indices
    return (k_idx, k_dist)
def predict(self, testDat):
    pred = np.zeros(testDat.shape[0])
    for j, t in enumerate(testDat):
        distances = cdist(t[np.newaxis, :], self.X, 'euclidean').ravel()
        index = bn.argpartsort(distances, n=self.k)
        label = self.y[index[:self.k]]
        votes = {}
        for i in label:
            if i in votes:
                votes[i] += 1
            else:
                votes[i] = 1
        pred[j] = max(votes.iteritems(), key=operator.itemgetter(1))[0]
    return pred
def detect_line(img):
    img_edge = edge_detect.full_detect(img, is_binary=False)
    sum_x = np.sum(img_edge, axis=0)
    # max_xs = bn.argpartsort(-sum_x, 10)[:10]
    max_xs = sum_x.argsort()[-10:]
    sum_y = np.sum(img_edge, axis=1)
    max_ys = bn.argpartsort(-sum_y, 10)[:10]
    height, width = img_edge.shape[:2]
    for x in max_xs:
        # print (x, 0), (x, height)
        cv2.line(img, (x, 0), (x, height), (255, 255, 255), 1)
    for y in max_ys:
        # print (0, y), (width, y)
        cv2.line(img, (0, y), (width, y), (255, 255, 255), 1)
    return img
def process(self, image, out=None):
    '''Performs llc encoding.
    '''
    K = self.specs.get('k', 5)
    reg = self.specs.get('reg', 1e-4)
    D = self.dictionary
    shape = image.shape[:-1]
    X = image.reshape((np.prod(shape), image.shape[-1]))
    # D_norm is the precomputed norm of the entries
    if 'D_norm' not in self.specs:
        self.specs['D_norm'] = (D**2).sum(1) / 2.
    D_norm = self.specs['D_norm']
    distance = mathutil.dot(X, -D.T)
    distance += D_norm
    # find the K closest indices
    if bn is not None:
        # use bottleneck which would be faster
        IDX = bn.argpartsort(distance, K, axis=1)[:, :K]
    else:
        IDX = np.argsort(distance, 1)[:, :K]
    # do LLC approximate coding
    if out is None:
        out = np.zeros((X.shape[0], D.shape[0]))
    else:
        out.resize((X.shape[0], D.shape[0]))
        out[:] = 0
    ONES = np.ones(K)
    Z = np.empty((K, D.shape[1]))
    for i in range(X.shape[0]):
        # shift to origin
        Z[:] = D[IDX[i]]
        Z -= X[i]
        # local covariance
        C = mathutil.dot(Z, Z.T)
        # add regularization
        C.flat[::K+1] += reg * C.trace()
        w = np.linalg.solve(C, ONES)
        out[i][IDX[i]] = w / w.sum()
    out.resize(shape + (out.shape[1],))
    return out
def nn(feat, feats, distance='euclidean', K=-1):
    """
    Exact nearest neighbor search through exhaustive comparison.
    """
    if distance == 'manhattan':
        dists = metrics.manhattan_distances(feat, feats)
    elif distance == 'euclidean':
        dists = metrics.euclidean_distances(feat, feats, squared=True)
    elif distance == 'chi_square':
        dists = -metrics.additive_chi2_kernel(feat, feats)
    elif distance == 'dot':
        dists = -np.dot(feat, feats)
    dists = dists.flatten()
    if K > 0:
        nn_ind = bn.argpartsort(dists, K).flatten()[:K]
        nn_ind = nn_ind[np.argsort(dists[nn_ind])]
    else:
        nn_ind = np.argsort(dists)
    nn_dist = dists[nn_ind]
    return nn_ind, nn_dist
def brute_radius_search(self, v, radius2=None, limit=None):
    v = v.flatten().astype(self._data_dtype)
    v_norm2 = bottleneck.ss(v)  # same as sum(v * v)
    d_norm2 = self.get_dataset('norm2', mmap_mode='r')
    dists = d_norm2 + v_norm2 - 2 * np.dot(self.data, v)
    #assert dists.ndim == 1 and not bottleneck.anynan(dists)
    ids = self.ids
    if radius2:
        mask = (dists < radius2)
        dists = dists[mask]
        ids = ids[mask]
    if limit:
        if limit == 1:
            imin = np.argmin(dists)
            return [(dists[imin], ids[imin])]
        else:
            # limit to the smallest values
            smallest_indices = bottleneck.argpartsort(dists, limit)[:limit]
            dists = dists[smallest_indices]
            ids = ids[smallest_indices]
    order = np.argsort(dists)
    return [(dists[i], ids[i]) for i in order]
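# The distance computation above relies on the expansion
# ||x - v||^2 = ||x||^2 + ||v||^2 - 2 x.v, which lets the per-row norms be
# precomputed once. A quick check of the identity on toy data (not from
# the original source):
import numpy as np

data = np.random.rand(10, 4)
v = np.random.rand(4)
dists = (data**2).sum(axis=1) + (v**2).sum() - 2 * np.dot(data, v)
assert np.allclose(dists, ((data - v)**2).sum(axis=1))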
def CalcSimilarSongs(userid):
    totalplaycount, meanplaycount = 0, 0
    usersongs = userDict[userid]
    totalnumofsongs = len(usersongs)
    for songid, pc in usersongs.iteritems():
        totalplaycount = totalplaycount + pc
    # compute the mean once, after accumulating the total
    meanplaycount = totalplaycount / totalnumofsongs
    highestnormalizedpc = 0.00
    for songid, pc in usersongs.iteritems():
        usersongs[songid] = pc / meanplaycount
        if usersongs[songid] > highestnormalizedpc:
            highestnormalizedpc = usersongs[songid]
            highnormpcsongid = songid
    top5similarsongs = songdataframe.loc[highnormpcsongid][
        bn.argpartsort(-songdataframe.loc[highnormpcsongid], 5)[:5]].index.values
    return top5similarsongs
def solve(X_train, y_train, X_test, y_test):
    """
    The basic genetic algorithm.

    :return: An uncompiled keras model. The best found Neural Network for
        the data.
    """
    # Generate pop_size number of random solutions. Solutions may be infeasible
    population = generate_population(pop_size, 136)
    for generation in trange(num_gen):
        # Calculate the "goodness" of each solution and give it a score
        fitness_scores = evaluate_fitness(population, X_train, y_train,
                                          X_test, y_test)
        # Introduce concept of elitism. In each generation, the num_elite best
        # solutions are carried forward to the next iteration without any
        # modification. Instead of sorting the entire array, partition it so
        # that only the top num_elite solutions are separated out.
        if num_elite != 0:
            part_sorted = bn.argpartsort(fitness_scores,
                                         fitness_scores.shape[0] - num_elite)
            elite_indices = part_sorted[-num_elite:]      # top num_elite solutions
            remaining_indices = part_sorted[:-num_elite]  # unsorted remainder
        # Print the best fitness score every 10 generations to see how the
        # algorithm is performing.
        if generation % 10 == 0:
            print(np.max(fitness_scores))
        # Select parents which will create the next generation
        if num_elite != 0:
            parents = select_parents(population[remaining_indices],
                                     fitness_scores[remaining_indices])
        else:
            parents = select_parents(population, fitness_scores)
        # Perform a crossover operation on selected parents
        children = crossover(parents, xover_rate)
        # Perform a mutation operation on crossovered parents
        mutate(children, mut_rate)
        if num_elite != 0:
            # Add back the elite solutions that were removed from parents and
            # use the result as the population for the next generation
            population = np.vstack((children, population[elite_indices]))
        else:
            population = children
    # Find the solution with the highest fitness score, i.e. the solution
    # with the least number of vertices.
    best_solution_index = np.argmax(fitness_scores)
    return population[best_solution_index]
def calc():
    if k < dist_local.shape[0]:
        index_local = bn.argpartsort(dist_local, k, axis=0)[:k, :]
        for r in xrange(stop_point - ii):
            dists_cpu_buffer_local = dist_local[index_local[:, r], r]
            indexes_cpu_buffer_local = index_local[:, r]
            index_local2 = np.argsort(dists_cpu_buffer_local, axis=0)
            dists[ii+r, :] = dists_cpu_buffer_local[index_local2]
            indexes[ii+r, :] = indexes_cpu_buffer_local[index_local2]
            #print ii+r, indexes_cpu_buffer_local[index_local2[0]]
    else:
        for r in xrange(stop_point - ii):
            dists_cpu_buffer_local = dist_local[:, r]
            index_local2 = np.argsort(dists_cpu_buffer_local, axis=0)
            dists[ii+r, :] = dists_cpu_buffer_local[index_local2]
            indexes[ii+r, :] = index_local2
def frameAnalysis():
    image = np.zeros((4000, 4000), np.uint8)
    for index in range(len(pointAngleArray)):
        theta = pointAngleArray[index]
        rho = pointDistanceArray[index]
        x, y = pol2cart(rho, math.radians(theta))
        if config.debug:
            print x, y
        if -2000 < x < 2000 and -2000 < y < 2000:
            image[2000 + x][2000 + y] = 200
    accumulator, thetas, rhos = hough_line(image)
    arr = np.ravel(accumulator)
    sortedArr = bn.argpartsort(arr, n=arr.shape[0] - config.maxLines)
    possibleLines = sortedArr[-config.maxLines:]
    lines = []
    for x in range(len(possibleLines)):
        index = possibleLines[x]
        rho = rhos[index / accumulator.shape[1]]
        theta = thetas[index % accumulator.shape[1]]
        for i in range(x + 1, len(possibleLines)):
            index2 = possibleLines[i]
            rho2 = rhos[index2 / accumulator.shape[1]]
            theta2 = thetas[index2 % accumulator.shape[1]]
            # print "Theta1: " + repr(theta) + " Theta2: " + repr(theta2) + " Rho1: " + repr(rho) + " Rho2: " + repr(rho2)
            if abs(theta - theta2) < 0.04 and ((rho > 0) == (rho2 > 0)):
                # print "Merge succeeded"
                arr[index] += arr[index2]
                arr[index2] = 0
                theta = (theta + theta2) / 2.0
                rho = (rho + rho2) / 2
            # else:
            #     print "Merge failed"
        if arr[index] >= config.minLength:
            print "index:"
            print arr[index]
            lines.append([arr[index], theta, rho])
            print "rho={0:.2f}, theta={1:.0f}".format(rho, np.rad2deg(theta))
    cv2.imwrite("houghlines3.jpg", image)
def argpartition(a, kth, axis=-1):
    return bottleneck.argpartsort(a, kth, axis)
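# Hedged compatibility sketch, not from the original source: bottleneck 1.0
# replaced argpartsort with a numpy-style argpartition (0-based kth), so the
# old "n smallest first" call can be emulated on modern stacks roughly like:
import numpy as np

def argpartsort_compat(a, n, axis=-1):
    """Indices putting the n smallest values first along `axis`."""
    # np.argpartition places the element of sorted rank `kth` at position
    # `kth`, with all smaller elements before it, so kth = n - 1 suffices
    # (assumes 1 <= n <= a.shape[axis]).
    return np.argpartition(a, n - 1, axis=axis)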
# Fragment of a TensorFlow training loop body:
_step = 0
_num += 1
if _num > 1:
    break
batch_xs, batch_ys = get_next_batch_rnn(_step)
Pred, _ = sess.run([pred, train_op], feed_dict={_x: batch_xs, _y: batch_ys})
loss = sess.run(cost, feed_dict={_x: batch_xs, _y: batch_ys})
test_index_max3 = np.zeros([size, out_times])
batch_test_xs, batch_test_ys = random_get_batch()
Pred_test = sess.run(tf.nn.softmax(pred), feed_dict={_x: batch_test_xs})
test_index = bottleneck.argpartsort(-batch_test_ys, 1, axis=1)[:, :1]
test_index_max3[:][:] = test_index
Pred_index_max3 = bottleneck.argpartsort(-Pred_test, out_times, axis=1)[:, :out_times]
test_acc = np.amax(1 * np.equal(Pred_index_max3, test_index_max3), axis=1)
test_acc = 1.0 * sum(test_acc) / len(test_acc)
result = np.concatenate((test_index, Pred_test), axis=1)
text = "_num:{0} _step:{1} _loss:{2} _accuracy:{3}".format(_num, _step, loss, test_acc)
# text = "_num:{0} _step:{1} _loss:{2} _accuracy:{3} _Pred_test:{4}".format(_num, _step, loss, test_acc, result)
print(text)
if test_acc >= acc:
    saver.save(sess, save_path=mkdir(acc), global_step=_step)
    acc += 0.02
def get_topics(files_array=[], n_clusters=100):
    if not len(files_array):
        return
    start_time = time.time()
    print 'Time at start: %.3f' % (time.time() - start_time)
    sys.stdout.flush()

    train_texts = get_training_text()
    print 'Time after getting train text from XML: %.3f' % (time.time() - start_time)
    sys.stdout.flush()

    vectorizer = CountVectorizer(max_df=0.5)
    train_mat_init = vectorizer.fit_transform(train_texts)
    del train_texts
    vocab = vectorizer.vocabulary_
    vocab_rev = {v: k for k, v in vocab.items()}
    print 'Time after CountVectorizer on train data: %.3f' % (time.time() - start_time)
    sys.stdout.flush()

    tfidf_fit = TfidfTransformer().fit(train_mat_init)
    train_mat = tfidf_fit.transform(train_mat_init)
    del train_mat_init
    print 'Time after TFIDF on train data: %.3f' % (time.time() - start_time)
    sys.stdout.flush()

    NMF_fit = NMF(n_components=n_clusters, init='nndsvda').fit(train_mat)
    del train_mat
    H = NMF_fit.components_
    #W = NMF_fit.transform(train_mat)

    num_best = 50
    best_indices = map(lambda v: list(bn.argpartsort(-v, num_best)[0:num_best]), H)
    for i in range(len(best_indices)):
        best_indices[i].sort(key=lambda j: -H[i, j])
    best_words = [[vocab_rev[i] for i in lst] for lst in best_indices]
    print 'Time after NMF fit: %.3f\n' % (time.time() - start_time)

    if best_words_filename is not None:
        with open(best_words_filename, 'wb') as best_words_file:
            for c, lst in enumerate(best_words):
                best_words_file.write(
                    str(c) + ' [' + ', '.join(map(lambda s: '\'' + s + '\'', lst)) + ']\n')
    else:
        print 'BEST WORDS FOR EACH CLUSTER:'
        for c, lst in enumerate(best_words):
            print '%d' % c, lst
        sys.stdout.flush()
    print '\nTime after NMF output: %.3f' % (time.time() - start_time)
    sys.stdout.flush()

    test_data = map(get_data, files_array)
    test_texts = [t for f, y, j, t in test_data]
    test_mat_init = vectorizer.transform(test_texts)
    del test_texts
    test_mat = tfidf_fit.transform(test_mat_init)
    del test_mat_init
    test_W = NMF_fit.transform(test_mat)
    del test_mat
    test_clusters = map(np.argmax, test_W)
    print 'Time after NMF test transform: %.3f\n' % (time.time() - start_time)

    print 'NUMBER OF CASES PER CLUSTER:'
    cluster_sizes = [np.sum(np.array(test_clusters) == c) for c in range(n_clusters)]
    for c, sz in enumerate(cluster_sizes):
        print '%d: %d' % (c, sz)
    print
    sys.stdout.flush()

    results = zip(test_clusters, test_data)
    results.sort(key=lambda (c, (f, y, j, t)): 2000 * c + y.year)  # sort by cluster, then by year
    with open(cluster_output_filename, 'ab') as output_file:
        writer = csv.writer(output_file)
        for c, (f, y, j, t) in results:
            writer.writerow((f, y, j, c))
    print '\nTime after all remaining output: %.3f\n' % (time.time() - start_time)

    cluster_weights = zip(files_array, test_W)
    n_cases = 20
    if best_match_cases is not None:
        with open(best_match_cases, 'ab') as best_matches_file:
            for cluster_id in range(n_clusters):
                best_matches_file.write('FOR CLUSTER %d:\n' % cluster_id)
                cluster_weights.sort(key=lambda (case, weights): -weights[cluster_id])
                clusters_ranked = map(lambda (c, w): c, cluster_weights[0:n_cases])
                for case in clusters_ranked:
                    best_matches_file.write(case + '\n')
                best_matches_file.write('\n')