def __call__(self, X, n = 10, x0 = None):
    '''Return n start points, padding or thinning any explicitly supplied ones.

    Parameters
    ----------
    X : 2-dimensional numpy.array containing #rows equal to number of data
        points and #columns equal to dimension of the data points
    n : required number of start points, if None (or 0), defaults to 10 start
        points
    x0 : 2-dimensional numpy.array containing #rows equal to number of
        explicitly defined start points and #columns equal to dimension of the
        feature space points

    Returns
    -------
    2-dimensional numpy.array with n rows:
      * x0 is None     -> n rows of X drawn at random (with replacement)
      * len(x0) == n   -> x0 itself
      * len(x0) <  n   -> x0 followed by n - len(x0) random rows of X
      * len(x0) >  n   -> n distinct rows of x0 chosen at random
    '''
    # Local import so the block does not depend on the file's import list.
    # randint(0, N, k) draws from the same range [0, N-1] as the deprecated
    # random_integers(0, N - 1, k).
    from numpy.random import randint

    self._Xi = X
    if n is None or n == 0:
        n = 10
    num_data_pts = self._Xi.shape[0]

    if x0 is None:
        return self._Xi[randint(0, num_data_pts, n), :]

    num_x0_pts = x0.shape[0]
    if num_x0_pts == n:
        return x0
    if num_x0_pts < n:
        # top up the explicit start points with random data points
        filler = self._Xi[randint(0, num_data_pts, n - num_x0_pts), :]
        return vstack((x0, filler))
    # num_x0_pts > n: keep a random subset (without replacement) of x0.
    # range() instead of xrange() keeps this working on Python 2 and 3.
    return x0[sample(range(num_x0_pts), n), :]
def twoDisjointLinesWithMSClustering():
    '''Demo: fit lpc curves to two disjoint noisy 3-d lines and plot the result.

    Builds a noisy diagonal line for t in [-1, 1), stacks it with a copy
    shifted by 3 in every coordinate, runs LPCImpl with an lpcMeanShift start
    points generator, then draws, per cluster, the member points, the cluster
    centre and the fitted curve in a 3-d matplotlib figure.
    '''
    t = arange(-1,1,0.002)
    # Noise is scaled by (1 - s*s), so it vanishes towards both ends of the line.
    # Lists are built explicitly: on Python 3 map()/zip() are lazy and
    # array(zip(...)) would give a 0-d object array instead of an (n, 3) array.
    x = [s + gauss(0,0.02)*(1-s*s) for s in t]
    y = [s + gauss(0,0.02)*(1-s*s) for s in t]
    z = [s + gauss(0,0.02)*(1-s*s) for s in t]
    line1 = array(list(zip(x,y,z)))
    line = vstack((line1, line1 + 3))
    lpc = LPCImpl(start_points_generator = lpcMeanShift(ms_h = 1), h = 0.05, mult = None, it = 200, cross = False, scaled = False, convergence_at = 0.001)
    lpc_curve = lpc.lpc(X=line)
    #Plot results
    fig = plt.figure()
    ax = Axes3D(fig)
    # NOTE(review): reaches into private attributes of the start points
    # generator; presumably _meanShift is a fitted clustering estimator
    # exposing labels_ / cluster_centers_ - confirm against lpcMeanShift.
    labels = lpc._startPointsGenerator._meanShift.labels_
    labels_unique = unique(labels)
    cluster_centers = lpc._startPointsGenerator._meanShift.cluster_centers_
    n_clusters = len(labels_unique)
    colors = cycle('bgrcmyk')
    for k, col in zip(range(n_clusters), colors):
        cluster_members = labels == k
        cluster_center = cluster_centers[k]
        ax.scatter(line[cluster_members, 0], line[cluster_members, 1], line[cluster_members, 2], c = col, alpha = 0.1)
        ax.scatter([cluster_center[0]], [cluster_center[1]], [cluster_center[2]], c = 'b', marker= '^')
        # assumes lpc_curve[k] is the curve started from cluster k - TODO confirm
        curve = lpc_curve[k]['save_xd']
        ax.plot(curve[:,0],curve[:,1],curve[:,2], c = col, linewidth = 3)
    plt.show()
def test_predict(self):
    '''Smoke test: fit a GP classifier on two rings and print its predictions.

    The first half of the samples comes from a ring of radius 3 (label -1),
    the second half from a ring of radius 10 (label +1). The exponentiated
    Laplace-approximation predictions on the training points are printed next
    to the labels; nothing is asserted.
    '''
    # define some easy training data and predict predictive distribution
    circle1 = Ring(variance=1, radius=3)
    circle2 = Ring(variance=1, radius=10)

    n = 100
    # integer division: n / 2 is a float on Python 3 and cannot be used
    # as a sample count or slice bound
    half = n // 2
    X = circle1.sample(half).samples
    X = vstack((X, circle2.sample(half).samples))
    y = ones(n)
    y[:half] = -1.0

#    plot(X[:n/2,0], X[:n/2,1], 'ro')
#    hold(True)
#    plot(X[n/2:,0], X[n/2:,1], 'bo')
#    hold(False)
#    show()

    covariance = SquaredExponentialCovariance(1, 1)
    likelihood = LogitLikelihood()
    gp = GaussianProcess(y, X, covariance, likelihood)

    # predict on mesh
    n_test = 20
    # each axis of the mesh spans the range of its own column
    # (the upper bound of P previously used column 1 by mistake)
    P = linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, n_test)
    Q = linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, n_test)
    X_test = asarray(list(itertools.product(P, Q)))
#    Y_test = exp(LaplaceApproximation(gp).predict(X_test).reshape(n_test, n_test))
    Y_train = exp(LaplaceApproximation(gp).predict(X))
    # single-argument print(...) behaves identically on Python 2 and 3
    print(Y_train)
    print(Y_train>0.5)
    print(y)
def test_stacking_arrays(self): a = array([[1,2,3,4],[5,6,7,8],[9,10,11,12]]) b = array([[13,14,15,16],[17,18,19,20],[21,22,23,24]]) c = vstack((a,b)) numpy.testing.assert_array_equal(c, array([[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16],[17,18,19,20],[21,22,23,24]])) d = hstack((a,b)) numpy.testing.assert_array_equal(d, array([[1,2,3,4,13,14,15,16], [5,6,7,8,17,18,19,20], [9,10,11,12,21,22,23,24]]))
def next(self):
    '''Advance to the next fold and return its train/test split.

    Returns
    -------
    (train_data, train_labels, test_data, test_labels) where the train parts
    are the vstack / hstack of all folds except the current one and the test
    parts are the current fold itself.

    Raises
    ------
    StopIteration once the fold counter has reached the last fold.
    '''
    # guard clause: nothing left to serve
    if not (self._curFold < self._numFolds-1):
        raise StopIteration()

    previous_test = self._testData
    if previous_test is not None:
        # return the previously held-out fold to its slot in the training lists
        self._trainData.insert(self._curFold, previous_test)
        self._trainLabels.insert(self._curFold, self._testLabels)

    self._curFold += 1
    fold = self._curFold
    # hold out the new current fold
    self._testData = self._trainData.pop(fold)
    self._testLabels = self._trainLabels.pop(fold)

    train_data = vstack(self._trainData)
    train_labels = hstack(self._trainLabels)
    return (train_data, train_labels, self._testData, self._testLabels)
def crossValidation(numFolds, data, labels, algorithm, accuracyList, learningCurveList, numLearningCurveIterations, learningCurveIndexMod):
    '''Run numFolds-fold cross validation of algorithm over data/labels.

    data and labels are cut into numFolds consecutive pieces with array_split.
    Each piece is used once as the test set while the remaining pieces form
    the training set. For every fold

      * algorithm(trainData, trainLabels, testData, testLabels) is called and
        its return value is appended to accuracyList;
      * learningCurve(...) is called with the same split plus
        learningCurveList, numLearningCurveIterations and
        learningCurveIndexMod.

    The index of each fold is written to stdout as progress output, all on one
    line, followed by a newline at the end. Nothing is returned; results are
    delivered through the list arguments.
    '''
    # Local import: sys.stdout.write is used for the progress output because
    # the Python 2 "print x," statement is a syntax error on Python 3.
    import sys

    dataFolds = array_split(data, numFolds)
    labelFolds = array_split(labels, numFolds)
    for testIndex in range(numFolds):
        sys.stdout.write('%s ' % testIndex)
        testData = dataFolds[testIndex]
        testLabels = labelFolds[testIndex]
        # every fold except the held-out one, without mutating the fold lists
        trainData = vstack(dataFolds[:testIndex] + dataFolds[testIndex + 1:])
        trainLabels = hstack(labelFolds[:testIndex] + labelFolds[testIndex + 1:])
        accuracyList.append(algorithm(trainData, trainLabels, testData, testLabels))
        learningCurve(algorithm, learningCurveList, trainData, trainLabels, testData, testLabels, numLearningCurveIterations, learningCurveIndexMod)
    sys.stdout.write('\n')
def setUp(self):
    '''Build the point-cloud fixtures used by the tests.

    self._line            : 10 noise-free points (v, v, v) for v = 0, 0.1, ..., 0.9
    self._line_cluster    : noisy points on the diagonal for t in [-1, -0.1)
                            and [1, 2), i.e. two separated segments
    self._line_cluster_2_ : a noisy diagonal line for t in [-1, 1) (noise
                            scaled by 1 - t*t) stacked with a copy of itself
                            shifted by 3 in every coordinate
    '''
    # Lists are materialised explicitly: on Python 3 map()/zip() are lazy and
    # array(zip(...)) would give a 0-d object array instead of an (n, 3) array.
    x = arange(0,1,0.1)
    y = arange(0,1,0.1)
    z = arange(0,1,0.1)
    self._line = array(list(zip(x,y,z)))

    t = hstack((arange(-1,-0.1,0.005), arange(1, 2, 0.005)))
    x = [s + gauss(0,0.005) for s in t]
    y = [s + gauss(0,0.005) for s in t]
    z = [s + gauss(0,0.005) for s in t]
    self._line_cluster = array(list(zip(x,y,z)))

    t = arange(-1,1,0.001)
    # noise amplitude (1 - s*s) vanishes towards both ends of the line
    x = [s + gauss(0,0.02)*(1-s*s) for s in t]
    y = [s + gauss(0,0.02)*(1-s*s) for s in t]
    z = [s + gauss(0,0.02)*(1-s*s) for s in t]
    line = array(list(zip(x,y,z)))
    self._line_cluster_2_ = vstack((line, line + 3))
def _followx( self, x, way = 'one', last_eigenvector = None, weights = 1.):
    '''Generates a single lpc curve, from the start point, x. Proceeds in forward ('one'), backward ('back') or both ('two')
       directions from this point.

       Parameters
       ----------
       x : 1-dim numpy.array of floats containing the start point for the lpc algorithm
       way : one of 'one'/'back'/'two', defines the orientation of the lpc propagation
       last_eigenvector: see _followxSingleDirection
       weights: see _followxSingleDirection

       Returns
       -------
       curve : a dictionary comprising a single lpc curve in m-dim feature space, with keys, values as self._followxSingleDirection
       with the addition of
         start_point, 1-dim numpy.array of floats of length m;
         start_point_index, index of start_point in save_xd;

       For way == 'two', the forward and backward curves are stitched together. save_xd, eigen_vecd, cos_neu_neu, rho and c0 are
       formed by concatenating the reversed 'back' curve (with start_point removed) with the 'one' curve. high_rho_points are the
       union of forward and backward high_rho_points. lamb is the cumulative segment distance along the stitched together save_xd
       with, as before, lamb[0] = 0.0.

       Raises
       ------
       ValueError if way is not one of 'one'/'back'/'two'

       TODO, should farm this out to an 'lpcCurve'-type class that knows how to join its instances
    '''
    if way in ('one', 'back'):
        # single-direction curve: the start point is the first saved point
        direction = Direction.FORWARD if way == 'one' else Direction.BACK
        curve = self._followxSingleDirection(x,
                                             direction = direction,
                                             last_eigenvector = last_eigenvector,
                                             weights = weights)
        curve['start_point'] = x
        curve['start_point_index'] = 0
        return curve

    if way == 'two':
        forward_curve = self._followxSingleDirection(x,
                                                     direction = Direction.FORWARD,
                                                     last_eigenvector = last_eigenvector,
                                                     weights = weights)
        # the forward curve is handed over so the backward pass can detect crossing it
        back_curve = self._followxSingleDirection(x,
                                                  direction = Direction.BACK,
                                                  forward_curve = forward_curve,
                                                  last_eigenvector = last_eigenvector,
                                                  weights = weights)
        #Stitching - append forward_curve to the end of the reversed back_curve with initial point of back curve removed
        #([:0:-1] walks the back curve from its last element down to, but excluding, index 0, i.e. the shared start point)
        combined_distance = hstack((-back_curve['lamb'][:0:-1], forward_curve['lamb']))
        curve = {'start_point': x,
                 'start_point_index': len(back_curve['save_xd']) - 1,
                 'high_rho_points': vstack((back_curve['high_rho_points'], forward_curve['high_rho_points'])),
                 #shift so that the stitched path length starts at 0
                 'lamb': combined_distance - min(combined_distance),
                }
        #2-dim entries are joined row-wise, 1-dim entries end to end
        for key, join in (('save_xd', vstack),
                          ('eigen_vecd', vstack),
                          ('cos_neu_neu', hstack),
                          ('rho', hstack),
                          ('c0', hstack)):
            curve[key] = join((back_curve[key][:0:-1], forward_curve[key]))
        return curve

    #call form of raise: the "raise E, msg" statement is a syntax error on Python 3
    raise ValueError('way must be one of one/back/two')
def _followxSingleDirection( self, x, direction = Direction.FORWARD, forward_curve = None, last_eigenvector = None, weights = 1.):
    '''Generates a partial lpc curve dictionary from the start point, x.

    Arguments
    ---------
    x : 1-dim, length m, numpy.array of floats, start point for the algorithm when m is dimension of feature space

    direction : Direction.FORWARD or Direction.BACK, multiplied into the first eigenvector, so it just sets the
                sign of the initial step
                NOTE(review): assumes the Direction members behave like +1 / -1 under multiplication - confirm

    forward_curve : dictionary as returned by this function, is used to detect crossing of the curve under construction with a
                    previously constructed curve

    last_eigenvector : 1-dim, length m, numpy.array of floats, a unit vector that defines the initial direction, relative to
                       which the first eigenvector is biased and initial cos_neu_neu is calculated

    weights : 1-dim, length n numpy.array of observation weights (can also be used to exclude
              individual observations from the computation by setting their weight to zero.),
              where n is the number of feature points

    Returns
    -------
    curve : dictionary, every array trimmed to the number of iterations actually performed
      save_xd         : local (kernel weighted) means, one row per iteration - the curve itself
      eigen_vecd      : direction used at each iteration (leading local eigenvector after sign flipping / penalisation)
      cos_neu_neu     : absolute cosine between successive directions
      rho             : ratio of the second largest to the largest local eigenvalue
      high_rho_points : positions x0 at which rho rose above the 'rho0' parameter
      lamb            : cumulative path length along save_xd, lamb[0] = 0
      c0              : bandwidth multipliers applied to h at each iteration
    '''
    x0 = copy(x)
    d = self.Xi.shape[1]
    it = self._lpcParameters['it']
    h = array(self._lpcParameters['h'])
    t0 = self._lpcParameters['t0']
    rho0 = self._lpcParameters['rho0']

    save_xd = empty((it,d))
    eigen_vecd = empty((it,d))
    c0 = ones(it)
    cos_alt_neu = ones(it)
    cos_neu_neu = ones(it)
    lamb = empty(it) #NOTE this is named 'lambda' in the original R code
    rho = zeros(it)
    high_rho_points = empty((0,d))
    count_points = 0

    for i in range(it):
        kernel_weights = self._kernd(self.Xi, x0, c0[i]*h) * weights
        mu_x = average(self.Xi, axis = 0, weights = kernel_weights)
        sum_weights = sum(kernel_weights)
        mean_sub = self.Xi - mu_x
        #Weighted scatter matrix mean_sub^T * diag(kernel_weights) * mean_sub. Scaling the columns of mean_sub^T
        #by the weights gives the same product without allocating the n x n diagonal matrix on every iteration.
        cov_x = dot(transpose(mean_sub) * kernel_weights, mean_sub) / sum_weights
        save_xd[i] = mu_x #save first point of the branch
        count_points += 1

        #calculate path length
        if i==0:
            lamb[0] = 0
        else:
            lamb[i] = lamb[i-1] + sqrt(sum((mu_x - save_xd[i-1])**2))

        #calculate eigenvalues/vectors
        #(sorted_eigen_cov is a list of tuples containing eigenvalue and associated eigenvector, sorted descending by eigenvalue)
        #eigh returns the eigenvectors as columns, so the rows of the transpose are paired with the eigenvalues;
        #sorted() is used because a zip object has no .sort() on Python 3
        eigenvalues, eigenvectors = eigh(cov_x)
        sorted_eigen_cov = sorted(zip(eigenvalues, transpose(eigenvectors)),
                                  key = lambda elt: elt[0],
                                  reverse = True)
        eigen_norm = sqrt(sum(sorted_eigen_cov[0][1]**2))
        eigen_vecd[i] = direction * sorted_eigen_cov[0][1] / eigen_norm #Unit eigenvector corresponding to largest eigenvalue

        #rho parameters
        rho[i] = sorted_eigen_cov[1][0] / sorted_eigen_cov[0][0] #Ratio of two largest eigenvalues
        if i != 0 and rho[i] > rho0 and rho[i-1] <= rho0:
            #rho has just crossed the threshold from below
            high_rho_points = vstack((high_rho_points, x0))

        #angle between successive eigenvectors
        if i==0 and last_eigenvector is not None:
            cos_alt_neu[i] = direction * dot(last_eigenvector, eigen_vecd[i])
        if i > 0:
            cos_alt_neu[i] = dot(eigen_vecd[i], eigen_vecd[i-1])

        #signum flipping (eigenvectors are only defined up to sign, keep heading the same way as the previous step)
        if cos_alt_neu[i] < 0:
            eigen_vecd[i] = -eigen_vecd[i]
            cos_neu_neu[i] = -cos_alt_neu[i]
        else:
            cos_neu_neu[i] = cos_alt_neu[i]

        #angle penalization (blend the new direction with the previous one, more strongly the larger the turn)
        pen = self._lpcParameters['pen']
        if pen > 0:
            if i == 0 and last_eigenvector is not None:
                a = abs(cos_alt_neu[i])**pen
                eigen_vecd[i] = a * eigen_vecd[i] + (1-a) * last_eigenvector
            if i > 0:
                a = abs(cos_alt_neu[i])**pen
                eigen_vecd[i] = a * eigen_vecd[i] + (1-a) * eigen_vecd[i-1]

        #check curve termination criteria
        if i not in (0, it-1):

            #crossing
            cross = self._lpcParameters['cross']
            if forward_curve is None:
                full_curve_points = save_xd[0:i+1]
            else:
                full_curve_points = vstack((forward_curve['save_xd'],save_xd[0:i+1])) #inefficient, initialize then append?
            if not cross:
                #indices of curve points within mean(h) of the current point; if they do not form one contiguous
                #run, the current point is close to a non-adjacent part of the curve, so stop
                prox = where(ravel(cdist(full_curve_points,[mu_x])) <= mean(h))[0]
                if len(prox) != max(prox) - min(prox) + 1:
                    break

            #convergence
            convergence_at = self._lpcParameters['convergence_at']
            conv_ratio = abs(lamb[i] - lamb[i-1]) / (2 * (lamb[i] + lamb[i-1]))
            if conv_ratio < convergence_at:
                break

            #boundary (shrink the bandwidth multiplier while the steps are small, otherwise let it recover up to 1)
            boundary = self._lpcParameters['boundary']
            if conv_ratio < boundary:
                c0[i+1] = 0.995 * c0[i]
            else:
                c0[i+1] = min(1.01*c0[i], 1)

        #step along in direction eigen_vecd[i]
        x0 = mu_x + t0 * eigen_vecd[i]

    #trim output in the case where convergence occurs before 'it' iterations
    curve = { 'save_xd': save_xd[0:count_points],
              'eigen_vecd': eigen_vecd[0:count_points],
              'cos_neu_neu': cos_neu_neu[0:count_points],
              'rho': rho[0:count_points],
              'high_rho_points': high_rho_points,
              'lamb': lamb[0:count_points],
              'c0': c0[0:count_points]
            }
    return curve
def test_column_stack_and_vstack(self): a=array([4.,2.]) b=array([2.,8.]) numpy.testing.assert_array_equal(column_stack((a[:,newaxis],b[:newaxis])), array([[4.,2.],[2.,8.]])) numpy.testing.assert_array_equal(vstack((a[:,newaxis],b[:,newaxis])), array([[4.],[2.],[2.],[8.]]))
def incomplete_cholesky(X, kernel, eta, power=1, blocksize=100):
    """
    Computes the incomplete Cholesky factorisation of the kernel matrix defined
    by samples X and a given kernel. The kernel is evaluated on-the-fly.
    The optional power parameter is used to multiply the kernel output with
    itself.

    Original code from "Kernel Methods for Pattern Analysis" by Shawe-Taylor and
    Cristianini.
    Modified to compute kernel on the fly, to use kernels multiplied with
    themselves (tensor product), and optimised speed via using vector
    operations and not pre-allocate full kernel matrix memory, but rather
    allocate memory of low-rank kernel block-wise
    Changes by Heiko Strathmann

    parameters:
    X         - non-empty list of input vectors to evaluate kernel on
    kernel    - a kernel object with a kernel method that takes 2d-arrays
                and returns a psd kernel matrix
    eta       - precision cutoff parameter for the low-rank approximation.
                Lies in (0,1) where smaller means more accurate.
    power     - every kernel evaluation is multiplied with itself this number
                of times. Zero is supported
    blocksize - tuning parameter for speed (must be positive), determines how
                many rows are allocated in a block for the (growing) kernel
                matrix. Larger means faster algorithm (to some extent if low
                rank dimension is larger than blocksize)

    output:
    K_chol, I, R, W, where
    K_chol - is the kernel using only the pivot index features
    I      - is a list containing the pivots used to compute K_chol
    R      - is a low-rank factor such that R.T.dot(R) approximates the
             original K
    W      - is a matrix such that W.T.dot(K_chol.dot(W)) approximates the
             original K

    raises:
    AssertionError if eta, power, blocksize or X violate the constraints above
    """
    assert(eta>0 and eta<1)
    assert(power>=0)
    # a zero blocksize would allocate zero-row buffers and fail on the first row write
    assert(blocksize>0)
    # without samples there is no diagonal entry to pick a first pivot from
    assert(len(X)>0)

    m=len(X)

    # growing low rank basis
    R=zeros((blocksize,m))

    # diagonal (assumed to be one)
    d=ones(m)

    # used indices
    I=[]
    nu=[]

    # algorithm is executed as long as a is bigger than eta precision
    a=d.max()
    I.append(d.argmax())

    # growing set of evaluated kernel values
    K=zeros((blocksize,m))

    j=0
    while a>eta:
        nu.append(sqrt(a))

        # kernel row of the current pivot against all samples
        if power>=1:
            K[j,:]=kernel.kernel([X[I[j]]], X)**power
        else:
            K[j,:]=ones(m)

        # contribution of the rows computed so far; the matrix-vector form
        # also covers j==1 and always yields a 1-dim vector of length m
        if j==0:
            R_dot_j=0
        else:
            R_dot_j=R[:j,:].T.dot(R[:j,I[j]])

        R[j,:]=(K[j,:] - R_dot_j)/nu[j]
        # residual diagonal; its maximum picks the next pivot
        d=d-R[j,:]**2
        a=d.max()
        I.append(d.argmax())
        j=j+1

        # allocate more space for kernel
        if j>=len(K):
            K=vstack((K, zeros((blocksize,m))))
            R=vstack((R, zeros((blocksize,m))))

    # remove un-used rows which were allocated unnecessarily
    K=K[:j,:]
    R=R[:j,:]

    # remove last pivot index since it is not used
    I=I[:-1]

    # from low rank to full rank
    W=solve(R[:,I], R)

    # low rank K
    K_chol=K[:,I]

    return K_chol, I, R, W
plt.legend()
plt.title('Clusters')
plt.show()

# Create an Excel file with each ticker and the cluster assigned to it.
details = list(zip(returns.index, idx))
# Naming the columns at construction also copes with an empty details list.
Det = pd.DataFrame(details, columns=['Ticker', 'Cluster'])
Det.to_excel("Ticker_Cluster.xlsx")

# Plot the hierarchical clustering dendrogram.
plt.figure(figsize=(15, 15))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('ETFs Countries')
plt.ylabel('Distance')
# One observation per row: column 0 is 'Vol', column 1 is 'Ret'.
observations = vstack([Cluster_points['Vol'], Cluster_points['Ret']]).T
dendrogram(linkage(observations, 'ward'),
           orientation='left',
           leaf_rotation=0.,
           leaf_font_size=16.,
           labels=returns.index
           )
plt.tight_layout()
plt.show()

# References:
# https://www.pythonforfinance.net/2018/02/08/stock-clusters-using-k-means-algorithm-in-python/
# https://nikkimarinsek.com/blog/7-ways-to-label-a-cluster-plot-python