Example #1
 def __call__(self, X, n = 10, x0 = None):
   '''
   Parameters
   ----------
   X : 2-dimensional numpy.array; the number of rows equals the number of data
       points and the number of columns equals the dimension of the feature space
   
   n : required number of start points; if None or 0, defaults to 10 start points
   
   x0 : 2-dimensional numpy.array of explicitly defined start points; the number
       of rows equals the number of explicit start points and the number of
       columns equals the dimension of the feature space
   '''
   self._Xi = X
   if n is None or n == 0:
     n = 10
   if x0 is not None:
     num_x0_pts = x0.shape[0]
   else:
     return self._Xi[random_integers(0, self._Xi.shape[0] - 1, n),:]
   if num_x0_pts == n:
     return x0
   elif num_x0_pts < n:
     return vstack((x0,self._Xi[random_integers(0, self._Xi.shape[0] - 1, n - num_x0_pts),:]))
   else: #num_x0_pts > n
     return x0[sample(xrange(0,num_x0_pts), n),:]
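For reference, a minimal standalone sketch of the same selection logic (the function name select_start_points is hypothetical; numpy.random.choice stands in for the deprecated random_integers):

from numpy import vstack
from numpy.random import choice

def select_start_points(X, n=10, x0=None):
    if x0 is None:
        # no explicit points: draw n rows of X at random (with replacement)
        return X[choice(X.shape[0], n), :]
    if x0.shape[0] == n:
        # exactly n explicit points: use them as given
        return x0
    if x0.shape[0] < n:
        # too few explicit points: top up with random rows of X
        return vstack((x0, X[choice(X.shape[0], n - x0.shape[0]), :]))
    # too many explicit points: subsample n of them (without replacement)
    return x0[choice(x0.shape[0], n, replace=False), :]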
Example #2
def twoDisjointLinesWithMSClustering():
 
  t = arange(-1,1,0.002)
  x = map(lambda x: x + gauss(0,0.02)*(1-x*x), t)
  y = map(lambda x: x + gauss(0,0.02)*(1-x*x), t)
  z = map(lambda x: x + gauss(0,0.02)*(1-x*x), t)
  line1 = array(zip(x,y,z))
  line = vstack((line1, line1 + 3))
  lpc = LPCImpl(start_points_generator = lpcMeanShift(ms_h = 1), h = 0.05, mult = None, it = 200, cross = False, scaled = False, convergence_at = 0.001)
  lpc_curve = lpc.lpc(X=line)
  #Plot results
  fig = plt.figure()
  ax = Axes3D(fig)
  labels = lpc._startPointsGenerator._meanShift.labels_
  labels_unique = unique(labels)
  cluster_centers = lpc._startPointsGenerator._meanShift.cluster_centers_
  n_clusters = len(labels_unique)
  colors = cycle('bgrcmyk')
  for k, col in zip(range(n_clusters), colors):
    cluster_members = labels == k
    cluster_center = cluster_centers[k]
    ax.scatter(line[cluster_members, 0], line[cluster_members, 1], line[cluster_members, 2], c = col, alpha = 0.1)
    ax.scatter([cluster_center[0]], [cluster_center[1]], [cluster_center[2]], c = 'b', marker= '^')
    curve = lpc_curve[k]['save_xd']
    ax.plot(curve[:,0],curve[:,1],curve[:,2], c = col, linewidth = 3)
  plt.show()
Example #3
    def test_predict(self):
        # define some easy training data and evaluate the predictive distribution
        circle1 = Ring(variance=1, radius=3)
        circle2 = Ring(variance=1, radius=10)
        
        n = 100
        X = circle1.sample(n / 2).samples
        X = vstack((X, circle2.sample(n / 2).samples))
        y = ones(n)
        y[:n / 2] = -1.0
        
#        plot(X[:n/2,0], X[:n/2,1], 'ro')
#        hold(True)
#        plot(X[n/2:,0], X[n/2:,1], 'bo')
#        hold(False)
#        show()

        covariance = SquaredExponentialCovariance(1, 1)
        likelihood = LogitLikelihood()
        gp = GaussianProcess(y, X, covariance, likelihood)

        # predict on mesh
        n_test = 20
        P = linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, n_test)
        Q = linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, n_test)
        X_test = asarray(list(itertools.product(P, Q)))
#        Y_test = exp(LaplaceApproximation(gp).predict(X_test).reshape(n_test, n_test))
        Y_train = exp(LaplaceApproximation(gp).predict(X))
        print Y_train
        
        print Y_train>0.5
        print y
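As an aside, the itertools.product mesh above can equivalently be built with numpy.meshgrid; a small sketch, assuming the same P and Q:

from numpy import column_stack, meshgrid

PP, QQ = meshgrid(P, Q, indexing='ij')           # 'ij' matches the itertools.product ordering
X_test = column_stack((PP.ravel(), QQ.ravel()))  # shape (n_test * n_test, 2)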
Example #4
 def test_stacking_arrays(self):
     a = array([[1,2,3,4],[5,6,7,8],[9,10,11,12]])
     b = array([[13,14,15,16],[17,18,19,20],[21,22,23,24]])
     c = vstack((a,b))
     numpy.testing.assert_array_equal(c, array([[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16],[17,18,19,20],[21,22,23,24]]))
     d = hstack((a,b))
     numpy.testing.assert_array_equal(d, array([[1,2,3,4,13,14,15,16],
                                                [5,6,7,8,17,18,19,20],
                                                [9,10,11,12,21,22,23,24]]))
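For 2-D inputs, both calls are equivalent to numpy.concatenate along the corresponding axis; a quick sketch reusing a and b from the test above:

from numpy import concatenate

numpy.testing.assert_array_equal(vstack((a, b)), concatenate((a, b), axis=0))
numpy.testing.assert_array_equal(hstack((a, b)), concatenate((a, b), axis=1))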
Example #5
 def next(self):
     if self._curFold < self._numFolds-1:
         if self._testData is not None:
             self._trainData.insert(self._curFold, self._testData)
             self._trainLabels.insert(self._curFold, self._testLabels)
             self._curFold += 1
         self._testData = self._trainData.pop(self._curFold)
         self._testLabels = self._trainLabels.pop(self._curFold)      
         ret = (vstack(self._trainData), hstack(self._trainLabels), self._testData, self._testLabels)
         #print self._curFold, self._numFolds, [x.shape for x in ret]
         return ret
     else:
         raise StopIteration()
Example #6
def crossValidation(numFolds, data, labels, algorithm, accuracyList, learningCurveList, numLearningCurveIterations, learningCurveIndexMod):
    dataFolds = array_split(data, numFolds)
    labelFolds = array_split(labels, numFolds)
    for testIndex in range(numFolds):
        print testIndex,
        testData = dataFolds.pop(testIndex)
        testLabels = labelFolds.pop(testIndex)
        trainData = vstack(dataFolds)
        trainLabels = hstack(labelFolds)
        accuracyList.append(algorithm(trainData, trainLabels, testData, testLabels))
        learningCurve(algorithm, learningCurveList, trainData, trainLabels, testData, testLabels, numLearningCurveIterations, learningCurveIndexMod)
        dataFolds.insert(testIndex, testData)
        labelFolds.insert(testIndex, testLabels)
    print ''
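The pop/vstack/insert pattern restores dataFolds after every iteration, so each fold serves as test data exactly once; a minimal sketch with toy data checking that the folds reassemble to the original array:

from numpy import arange, array_split, vstack
import numpy.testing

data = arange(12).reshape(6, 2)
folds = array_split(data, 3)
test = folds.pop(1)                    # hold out the middle fold
train = vstack(folds)                  # remaining folds form the training set
folds.insert(1, test)                  # restore for the next iteration
numpy.testing.assert_array_equal(vstack(folds), data)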
Example #7
 def setUp(self):
   x = arange(0,1,0.1)
   y = arange(0,1,0.1)
   z = arange(0,1,0.1)
   self._line = array(zip(x,y,z))
   
   t = hstack((arange(-1,-0.1,0.005), arange(1, 2, 0.005)))
   x = map(lambda x: x + gauss(0,0.005), t)
   y = map(lambda x: x + gauss(0,0.005), t)
   z = map(lambda x: x + gauss(0,0.005), t)
   self._line_cluster = array(zip(x,y,z))
   
   t = arange(-1,1,0.001)
   x = map(lambda x: x + gauss(0,0.02)*(1-x*x), t)
   y = map(lambda x: x + gauss(0,0.02)*(1-x*x), t)
   z = map(lambda x: x + gauss(0,0.02)*(1-x*x), t)
   line = array(zip(x,y,z))
   self._line_cluster_2_ = vstack((line, line + 3))
Example #8
File: lpc.py  Project: epp-warwick/lpcm
 def _followx( self, x, way = 'one', last_eigenvector = None, weights = 1.):
   '''Generates a single lpc curve, from the start point, x. Proceeds in forward ('one'), backward ('back') or both ('two') 
   directions from this point.  
   
   Parameters
   ----------
   x : 1-dim numpy.array of floats containing the start point for the lpc algorithm
   way : one of 'one'/'back'/'two', defines the orientation of the lpc propagation
   last_eigenvector : see _followxSingleDirection
   weights : see _followxSingleDirection
   
   Returns
   -------
   curve : a dictionary comprising a single lpc curve in m-dim feature space, with the same keys and values as
   self._followxSingleDirection, with the addition of
             
     start_point, 1-dim numpy.array of floats of length m;
     start_point_index, index of start_point in save_xd;
   
     For way == 'two', the forward and backward curves are stitched together. save_xd, eigen_vecd, cos_neu_neu, rho and c0 are formed
     by concatenating the reversed 'back' curve (with start_point removed) with the 'one' curve. high_rho_points are the union of 
     forward and backward high_rho_points. lamb is the cumulative segment distance along the stitched together save_xd with, as before,
     lamb[0] = 0.0. TODO, should farm this out to an 'lpcCurve'-type class that knows how to join its instances          
   '''
   if way == 'one':
     curve = self._followxSingleDirection(
                                 x, 
                                 direction = Direction.FORWARD,
                                 last_eigenvector = last_eigenvector,
                                 weights = weights)
     curve['start_point'] = x
     curve['start_point_index'] = 0
     return curve
   elif way == 'back':
     curve = self._followxSingleDirection(
                                 x,
                                 direction = Direction.BACK,
                                 last_eigenvector = last_eigenvector,
                                 weights = weights)
     curve['start_point'] = x
     curve['start_point_index'] = 0
     return curve
   elif way == 'two':
     forward_curve =  self._followxSingleDirection(
                                 x, 
                                 direction = Direction.FORWARD,
                                 last_eigenvector = last_eigenvector,
                                 weights = weights)
     back_curve =  self._followxSingleDirection(
                                 x,
                                 direction = Direction.BACK,
                                 forward_curve = forward_curve,
                                 last_eigenvector = last_eigenvector,
                                 weights = weights)
     #Stitching - append forward_curve to the end of the reversed back_curve with initial point of back curve removed
     #TODO - neaten this up, looks pretty clumsy
     combined_distance = hstack((-back_curve['lamb'][:0:-1], forward_curve['lamb'])) 
     curve = {'start_point': x,
              'start_point_index': len(back_curve['save_xd']) - 1,
              'save_xd': vstack((back_curve['save_xd'][:0:-1], forward_curve['save_xd'])),
              'eigen_vecd': vstack((back_curve['eigen_vecd'][:0:-1], forward_curve['eigen_vecd'])),
              'cos_neu_neu': hstack((back_curve['cos_neu_neu'][:0:-1], forward_curve['cos_neu_neu'])),
              'rho': hstack((back_curve['rho'][:0:-1], forward_curve['rho'])),
              'high_rho_points': vstack((back_curve['high_rho_points'], forward_curve['high_rho_points'])),
              'lamb': combined_distance - min(combined_distance),
              'c0': hstack((back_curve['c0'][:0:-1], forward_curve['c0'])),
              }      
     return curve
   else:
      raise ValueError("way must be one of 'one'/'back'/'two'")
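The [:0:-1] slices in the stitching step reverse the backward branch and drop its first element (the shared start point) so it is not duplicated; a toy sketch of the lamb combination:

from numpy import array, hstack

back = array([0.0, 1.0, 2.0])             # lamb along the backward branch
fwd = array([0.0, 0.5, 1.5])              # lamb along the forward branch
combined = hstack((-back[:0:-1], fwd))    # [-2., -1., 0., 0.5, 1.5]
lamb = combined - combined.min()          # shift so that lamb[0] == 0.0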
Example #9
File: lpc.py  Project: epp-warwick/lpcm
 def _followxSingleDirection(  self, 
                               x, 
                               direction = Direction.FORWARD,
                               forward_curve = None,
                               last_eigenvector = None, 
                               weights = 1.):
   '''Generates a partial lpc curve dictionary from the start point, x.
   Arguments
   ---------
   x : 1-dim, length m, numpy.array of floats, start point for the algorithm when m is dimension of feature space
   
    direction : Direction.FORWARD or Direction.BACK, the direction to proceed in
        from this point (just sets the sign of the first eigenvector)
   
   forward_curve : dictionary as returned by this function, is used to detect crossing of the curve under construction with a
       previously constructed curve
       
   last_eigenvector : 1-dim, length m, numpy.array of floats, a unit vector that defines the initial direction, relative to
       which the first eigenvector is biased and initial cos_neu_neu is calculated  
       
    weights : 1-dim, length n, numpy.array of observation weights, where n is the
        number of data points (individual observations can be excluded from the
        computation by setting their weight to zero)
   '''
   x0 = copy(x)
   N = self.Xi.shape[0]
   d = self.Xi.shape[1]
   it = self._lpcParameters['it']
   h = array(self._lpcParameters['h'])
   t0 = self._lpcParameters['t0']
   rho0 = self._lpcParameters['rho0']
   
   save_xd = empty((it,d))
   eigen_vecd = empty((it,d))
   c0 = ones(it)
   cos_alt_neu = ones(it)
   cos_neu_neu = ones(it)    
   lamb = empty(it) #NOTE this is named 'lambda' in the original R code
   rho = zeros(it)
   high_rho_points = empty((0,d))    
   count_points = 0
   
   for i in range(it):
     kernel_weights = self._kernd(self.Xi, x0, c0[i]*h) * weights
     mu_x = average(self.Xi, axis = 0, weights = kernel_weights)
     sum_weights = sum(kernel_weights)
     mean_sub = self.Xi - mu_x 
     cov_x = dot( dot(transpose(mean_sub), numpy.diag(kernel_weights)), mean_sub) / sum_weights 
     #assert (abs(cov_x.transpose() - cov_x)/abs(cov_x.transpose() + cov_x) < 1e-6).all(), 'Covariance matrix not symmetric, \n cov_x = {0}, mean_sub = {1}'.format(cov_x, mean_sub)
      save_xd[i] = mu_x #save the current point of the branch
     count_points += 1
     
     #calculate path length
     if i==0:
       lamb[0] = 0
     else:
       lamb[i] = lamb[i-1] + sqrt(sum((mu_x - save_xd[i-1])**2))
     
     #calculate eigenvalues/vectors
     #(sorted_eigen_cov is a list of tuples containing eigenvalue and associated eigenvector, sorted descending by eigenvalue)
     eigen_cov = eigh(cov_x)
     sorted_eigen_cov = zip(eigen_cov[0],map(ravel,vsplit(eigen_cov[1].transpose(),len(eigen_cov[1]))))
     sorted_eigen_cov.sort(key = lambda elt: elt[0], reverse = True)   
     eigen_norm = sqrt(sum(sorted_eigen_cov[0][1]**2))
     eigen_vecd[i] = direction * sorted_eigen_cov[0][1] / eigen_norm  #Unit eigenvector corresponding to largest eigenvalue
     
     #rho parameters
     rho[i] = sorted_eigen_cov[1][0] / sorted_eigen_cov[0][0] #Ratio of two largest eigenvalues
     if i != 0 and rho[i] > rho0 and rho[i-1] <= rho0:
       high_rho_points = vstack((high_rho_points, x0))
     
     #angle between successive eigenvectors
     if i==0 and last_eigenvector is not None:
       cos_alt_neu[i] = direction * dot(last_eigenvector, eigen_vecd[i])
     if i > 0:
       cos_alt_neu[i] = dot(eigen_vecd[i], eigen_vecd[i-1])
     
     #signum flipping
     if cos_alt_neu[i] < 0:
       eigen_vecd[i] = -eigen_vecd[i]
       cos_neu_neu[i] = -cos_alt_neu[i]
     else:
       cos_neu_neu[i] = cos_alt_neu[i]
    
     #angle penalization
     pen = self._lpcParameters['pen']
     if pen > 0:
       if i == 0 and last_eigenvector is not None:
         a = abs(cos_alt_neu[i])**pen
         eigen_vecd[i] = a * eigen_vecd[i] + (1-a) * last_eigenvector
       if i > 0:
         a = abs(cos_alt_neu[i])**pen
         eigen_vecd[i] = a * eigen_vecd[i] + (1-a) * eigen_vecd[i-1]
             
     #check curve termination criteria
     if i not in (0, it-1):
       #crossing
       cross = self._lpcParameters['cross']
       if forward_curve is None:
         full_curve_points = save_xd[0:i+1]
       else:
         full_curve_points = vstack((forward_curve['save_xd'],save_xd[0:i+1])) #inefficient, initialize then append? 
       if not cross:
         prox = where(ravel(cdist(full_curve_points,[mu_x])) <= mean(h))[0]
         if len(prox) != max(prox) - min(prox) + 1:
           break
         
       #convergence
       convergence_at = self._lpcParameters['convergence_at']
       conv_ratio = abs(lamb[i] - lamb[i-1]) / (2 * (lamb[i] + lamb[i-1]))
       if conv_ratio  < convergence_at:
         break
       
       #boundary
       boundary = self._lpcParameters['boundary']
       if conv_ratio < boundary:
         c0[i+1] = 0.995 * c0[i]
       else:
         c0[i+1] = min(1.01*c0[i], 1)
     
     #step along in direction eigen_vecd[i]
     x0 = mu_x + t0 * eigen_vecd[i]
   
   #trim output in the case where convergence occurs before 'it' iterations    
   curve = { 'save_xd': save_xd[0:count_points],
             'eigen_vecd': eigen_vecd[0:count_points],
             'cos_neu_neu': cos_neu_neu[0:count_points],
             'rho': rho[0:count_points],
             'high_rho_points': high_rho_points,
             'lamb': lamb[0:count_points],
             'c0': c0[0:count_points]
           }
   return curve  
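The heart of each iteration is a kernel-weighted local PCA; a minimal standalone sketch of one such step (the Gaussian weight formula is an assumption here, standing in for self._kernd):

from numpy import average, diag, dot, exp, sum, transpose
from numpy.linalg import eigh
from numpy.random import randn

Xi = randn(100, 3)                                  # toy data cloud
x0 = Xi[0]                                          # current point on the curve
h = 0.5                                             # bandwidth
w = exp(-sum((Xi - x0)**2, axis=1) / (2 * h**2))    # Gaussian kernel weights
mu = average(Xi, axis=0, weights=w)                 # local weighted mean
centred = Xi - mu
cov = dot(dot(transpose(centred), diag(w)), centred) / sum(w)
vals, vecs = eigh(cov)                              # eigenvalues in ascending order
v = vecs[:, -1]                                     # principal direction (largest eigenvalue)
rho = vals[-2] / vals[-1]                           # ratio of the two largest eigenvalues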
Example #10
 def test_column_stack_and_vstack(self):
     a=array([4.,2.])
     b=array([2.,8.])
     numpy.testing.assert_array_equal(column_stack((a[:,newaxis],b[:,newaxis])), array([[4.,2.],[2.,8.]]))
     numpy.testing.assert_array_equal(vstack((a[:,newaxis],b[:,newaxis])), array([[4.],[2.],[2.],[8.]]))
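For 1-D inputs the two functions differ: column_stack turns each array into a column, while vstack stacks them as rows (the test data above happens to be symmetric, which hides the difference); a sketch with asymmetric data:

a1 = array([1., 2., 3.])
b1 = array([4., 5., 6.])
numpy.testing.assert_array_equal(column_stack((a1, b1)),
                                 array([[1., 4.], [2., 5.], [3., 6.]]))
numpy.testing.assert_array_equal(vstack((a1, b1)),
                                 array([[1., 2., 3.], [4., 5., 6.]]))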
Example #11
def incomplete_cholesky(X, kernel, eta, power=1, blocksize=100):
    """
    Computes the incomplete Cholesky factorisation of the kernel matrix defined
    by samples X and a given kernel. The kernel is evaluated on-the-fly.
    The optional power parameter is used to multiply the kernel output with
    itself.
    
    Original code from "Kernel Methods for Pattern Analysis" by Shawe-Taylor and
    Cristianini.
    Modified to compute the kernel on the fly, to use kernels multiplied with
    themselves (tensor product), and optimised for speed by using vector
    operations and by not pre-allocating the full kernel matrix, but rather
    allocating memory for the low-rank kernel block-wise.
    Changes by Heiko Strathmann
    
    parameters:
    X         - list of input vectors to evaluate kernel on
    kernel    - a kernel object with a kernel method that takes 2d-arrays
                and returns a psd kernel matrix
    eta       - precision cutoff parameter for the low-rank approximation.
                Lies in (0,1), where smaller means more accurate.
    power     - every kernel evaluation is multiplied with itself this number
                of times. Zero is supported
    blocksize - tuning parameter for speed, determines how many rows are
                allocated per block for the (growing) kernel matrix. Larger
                means a faster algorithm (to some extent, if the low-rank
                dimension is larger than the blocksize)
    
    output:
    K_chol, I, R, W, where
    K_chol - is the kernel matrix restricted to the pivot index features
    I      - is a list containing the pivots used to compute K_chol
    R      - is a low-rank factor such that R.T.dot(R) approximates the
             original K
    W      - is a matrix such that W.T.dot(K_chol.dot(W)) approximates the
             original K
    
    """
    assert(eta>0 and eta<1)
    assert(power>=0)
    assert(blocksize>0)
    assert(len(X)>0)
    
    m=len(X)

    # growing low rank basis
    R=zeros((blocksize,m))
    
    # diagonal (assumed to be one)
    d=ones(m)
    
    # used indices
    I=[]
    nu=[]
    
    # algorithm is executed as long as a is bigger than eta precision
    a=d.max()
    I.append(d.argmax())
    
    # growing set of evaluated kernel values
    K=zeros((blocksize,m))
    
    j=0
    while a>eta:
        nu.append(sqrt(a))
        
        if power>=1:
            K[j,:]=kernel.kernel([X[I[j]]], X)**power
        else:
            K[j,:]=ones(m)
            
        if j==0:
            R_dot_j=0
        elif j==1:
            R_dot_j=R[:j,:]*R[:j,I[j]]
        else:
            R_dot_j=R[:j,:].T.dot(R[:j,I[j]])
                        
        R[j,:]=(K[j,:] - R_dot_j)/nu[j]
        d=d-R[j,:]**2
        a=d.max()
        I.append(d.argmax())
        j=j+1
        
        # allocate more space for kernel
        if j>=len(K):
            K=vstack((K, zeros((blocksize,m))))
            R=vstack((R, zeros((blocksize,m))))
            
    # remove unused rows which were allocated unnecessarily
    K=K[:j,:]
    R=R[:j,:]

    # remove the last pivot index since it is not used
    I=I[:-1]
    
    # from low rank to full rank
    W=solve(R[:,I], R)
    
    # low rank K
    K_chol=K[:,I]
    
    return K_chol, I, R, W
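A usage sketch checking the advertised low-rank identities; the GaussianKernel class here is hypothetical (any object exposing a kernel(X, Y) method that returns a psd matrix with unit diagonal would do):

from numpy import exp, zeros
from numpy.linalg import norm
from numpy.random import randn

class GaussianKernel(object):
    def __init__(self, sigma):
        self.sigma = sigma
    def kernel(self, X, Y):
        # naive Gaussian kernel matrix between two lists of vectors
        K = zeros((len(X), len(Y)))
        for i in range(len(X)):
            for j in range(len(Y)):
                K[i, j] = exp(-norm(X[i] - Y[j])**2 / (2 * self.sigma**2))
        return K

X = list(randn(50, 2))                  # list of input vectors, per the docstring
k = GaussianKernel(sigma=1.0)
K_chol, I, R, W = incomplete_cholesky(X, k, eta=0.1)
K_full = k.kernel(X, X)
err_R = norm(K_full - R.T.dot(R))                 # should be small (controlled by eta)
err_W = norm(K_full - W.T.dot(K_chol.dot(W)))     # should be comparably small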
Example #12
plt.legend()
plt.title('Clusters')
plt.show()

'Creates an excel file with the ticker and the cluster assigned'
details = [(name,cluster) for name, cluster in zip(returns.index,idx)]
Det = pd.DataFrame(details)
Det.columns = ['Ticker','Cluster']
Det.to_excel("Ticker_Cluster.xlsx")

'Plot Hierarchical Clustering Dendrogram'
plt.figure(figsize=(15, 15))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('ETFs Countries')
plt.ylabel('Distance')
dendrogram(linkage(vstack([Cluster_points['Vol'],Cluster_points['Ret']]).T, 
                   'ward'),
            orientation='left',
            leaf_rotation=0.,
            leaf_font_size=16.,
            labels=returns.index )
plt.tight_layout()
plt.show()


# References:

'https://www.pythonforfinance.net/2018/02/08/stock-clusters-using-k-means-algorithm-in-python/'
'https://nikkimarinsek.com/blog/7-ways-to-label-a-cluster-plot-python'