# classification_baseline.py
import logging
import math

import numpy as np

import my_io


def splitData(X, y, portion, seed):
    """Randomly split (X, y) into test and train sets; `portion` is the test fraction."""
    my_io.startLog(__name__)
    logger = logging.getLogger(__name__)
    records = {'portion': portion, 'seed': seed}
    logger.info('split data into train and test %s', records)
    if not isinstance(X, np.ndarray):
        logger.debug('X is not an ndarray, converting')
        X = np.array(X)
        y = np.array(y)
    (rows, cols) = np.shape(X)
    size_data = rows
    np.random.seed(seed)  # seed was accepted but never used; make the split reproducible
    index = np.random.permutation(size_data)
    n_test = int(math.floor(portion * size_data))  # number of test samples
    idx_test = index[:n_test]
    idx_train = index[n_test:]
    X_test = X[idx_test, :]
    X_train = X[idx_train, :]
    y_test = y[idx_test]
    y_train = y[idx_train]
    logger.info('split done, checking split')
    (rows_test, cols_test) = np.shape(X_test)
    (rows_train, cols_train) = np.shape(X_train)
    if rows_train < rows_test:  # was `rows_train < rows_train`, which is never true
        logger.warning('test set larger than training set')
    logger.info('size of X_test is %d and X_train %d, #fts %d',
                rows_test, rows_train, cols_train)
    return X_test, X_train, y_train, y_test
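# A minimal usage sketch for splitData (hypothetical, assuming the project's
# my_io module is importable; the synthetic 100x5 dataset is illustrative only):
if __name__ == '__main__':
    X_demo = np.random.rand(100, 5)
    y_demo = np.random.randint(0, 2, size=100)
    X_te, X_tr, y_tr, y_te = splitData(X_demo, y_demo, portion=0.2, seed=1)
    # with portion=0.2, 20 of the 100 rows land in the test set
    assert X_te.shape[0] == 20 and X_tr.shape[0] == 80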
# kmeans.py
import logging

import numpy as np
from numpy import linalg as LA  # used by kmeans_update below
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize  # assumed source of `normalize`

import my_io


def kmeansClustering(X, k, maxIters=10):
    """K-means clustering.

    X: data of shape N x D
    k: number of clusters
    """
    my_io.startLog(__name__)
    logger = logging.getLogger(__name__)
    X = normalize(X)
    (N, D) = np.shape(X)
    ax = plt.gca()
    ax.scatter(X[:, 0], X[:, 1], c='r', alpha=0.8)  # raw data before clustering
    # plt.show()
    # initialize means (shape k x D): sample k distinct data points;
    # was random.sample(X, k), which fails on a numpy array in Python 3
    mu_old = X[np.random.choice(N, k, replace=False), :]
    logger.info('randomly initialized mean vectors, checking dimension:')
    if np.shape(mu_old) == (k, D):
        logger.info('pass')
    else:
        logger.error('failed')
    logger.info('maximum iterations: %d, starting iteration', maxIters)
    loss = np.zeros(maxIters)  # was np.zeros((N, 1)): indexed by iteration, not by point
    for i in range(maxIters):
        (Ln, r, Mu) = kmeans_update(X, mu_old)
        loss[i] = Ln
        logger.info('iteration %d, loss %.4f', i, Ln)
        mu_old = Mu
    logger.info('done, loss %.3f', Ln)
    ax = plt.gca()
    colors_list = ['r', 'g', 'b', 'y']
    colors = [colors_list[int(r[i])] for i in range(N)]
    ax.scatter(X[:, 0], X[:, 1], c=colors, alpha=0.8)  # color points by assigned cluster
    # plt.show()
    return X, r
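# Usage sketch: recover two synthetic 2-D blobs (make_blobs is used here for
# the demo only and is not a project dependency):
if __name__ == '__main__':
    from sklearn.datasets import make_blobs
    X_demo, _ = make_blobs(n_samples=200, centers=2, n_features=2, random_state=0)
    X_out, r = kmeansClustering(X_demo, k=2, maxIters=10)
    plt.show()  # display the scatter plots drawn inside kmeansClustering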
# gmm.py (file name assumed)
import logging

import numpy as np
import matplotlib.pyplot as plt

import my_io
import kmeans

# compute_cova, gmm_Esteps, gmm_Msteps and loss are helpers defined
# elsewhere in this module.


def gmmClustering(X, k=2, maxiter=3):
    """Fit a Gaussian mixture with EM, warm-started from one k-means step."""
    my_io.startLog(__name__)
    logger = logging.getLogger(__name__)
    # one k-means iteration supplies the initial hard assignments r
    X, r = kmeans.kmeansClustering(X, k, 1)  # was hardcoded to 2 clusters
    (N, D) = np.shape(X)
    # initial mixing weights: fraction of points assigned to each cluster
    pi_k_old = [len(np.where(r == kth)[0]) / float(N) for kth in range(k)]
    cova_old, mu_old = compute_cova(k, X, r)
    for i in range(maxiter):
        logger.info('iter: %d loss: %f', i, loss(X, mu_old, pi_k_old, cova_old))
        p = gmm_Esteps(X, pi_k_old, k, cova_old, mu_old)  # E-step: responsibilities
        mu_new, cova_new, pi_k_new = gmm_Msteps(mu_old, X, p)  # M-step: new parameters
        pi_k_old = pi_k_new
        mu_old = mu_new
        cova_old = cova_new
    # visualize the soft assignments: color each point by its responsibility
    cm = plt.get_cmap('jet')
    for component in (1, 0):
        for j in range(N):
            likelihood = p[component][j]  # was misspelled `likehood`
            plt.plot(X[j, 0], X[j, 1], "o", color=cm(likelihood))
        plt.show()
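# gmm_Esteps itself is defined elsewhere; for reference, a minimal sketch of
# what a GMM E-step computes, namely the responsibilities
# p[k][n] = pi_k * N(x_n | mu_k, Sigma_k) / sum_j pi_j * N(x_n | mu_j, Sigma_j).
# The function name, argument order, and use of scipy here are illustrative
# assumptions and may not match the project's own implementation:
from scipy.stats import multivariate_normal

def gmm_estep_sketch(X, pi_k, k, cova, mu):
    N = X.shape[0]
    p = np.zeros((k, N))
    for kth in range(k):
        # weighted density of every point under component kth
        p[kth] = pi_k[kth] * multivariate_normal.pdf(X, mean=mu[kth], cov=cova[kth])
    return p / p.sum(axis=0, keepdims=True)  # normalize so columns sum to 1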
# svm_bench.py
import logging

import numpy as np
from numpy import linalg as LA
from sklearn import svm
from sklearn.metrics import zero_one_loss  # was `from sklearn.metrics import classification`

import my_io
import classification_baseline

my_io.setUp('./biological_response/')
my_io.startLog(__name__)
logger = logging.getLogger(__name__)

y, X, trainData, testData = my_io.readCsv()
portion = 0.2
seed = 1
X_test, X_train, y_train, y_test = classification_baseline.splitData(X, y, portion, seed)

logger.info('init svm classifier')
svc = svm.SVC(probability=True)
logger.info('fitting svc')
svc.fit(X_train, y_train)
logger.info('start predict')
predict_probs = svc.predict_proba(X_test)
predict = my_io.toZeroOne(predict_probs)  # threshold probabilities to hard 0/1 labels
# error = zero_one_loss(y_test, predict)
loss = np.subtract(predict, y_test)
error = LA.norm(loss)
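# Optional sanity check (assumes toZeroOne returns hard 0/1 labels): with 0/1
# labels the L2 norm of the difference is sqrt(#misclassified), so the norm-based
# error and the commented-out zero_one_loss should agree:
n_wrong = int(round(error ** 2))
assert abs(zero_one_loss(y_test, predict) * len(y_test) - n_wrong) < 1e-6
logger.info('misclassified %d of %d test points', n_wrong, len(y_test))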
# kmeans.py (continued)
def kmeans_update(X, Mu):
    """One k-means step: update assignments r and means Mu, given X and Mu.

    X  is the [N x D] data matrix
    Mu is the [K x D] matrix of mean vectors
    r  is the length-N assignment vector, e.g. r = [0, 1, 0]
       for 2 clusters and 3 data points
    Returns the mean distortion Ln_mean (average distance of each point
    to its assigned center), r, and the updated Mu.
    """
    my_io.startLog(__name__)
    logger = logging.getLogger(__name__)
    (K, D1) = np.shape(Mu)
    (N, D) = np.shape(X)
    logger.info('check shape mu: %s X: %s', str(np.shape(Mu)), str(np.shape(X)))
    if D1 == D:
        logger.info('pass')
    else:
        logger.error('failed')
    r = np.zeros((N, 1))
    dis2Muk = np.zeros((K, 1))
    # assignment step: give each point x_n to the nearest mean
    for n in range(N):
        xn = X[n, :]
        for k in range(K):
            # distance from point x_n to mean mu_k
            dis2Muk[k] = LA.norm(np.subtract(xn, Mu[k, :]))
        r[n] = np.argmin(dis2Muk)  # index of the nearest mean
    # update step: recompute each mean from the points assigned to it
    for k in range(K):
        xk = [X[idx, :] for idx in range(len(r)) if r[idx] == k]  # xk: Nk x D
        logger.info('in group %d, num of points %s', k, str(np.shape(xk)))
        Mu[k, :] = np.mean(xk, 0)  # 0: mean along each column (empty cluster -> nan)
    # distortion: distance of each point to its (new) assigned mean
    Ln = [LA.norm(np.subtract(X[n, :], Mu[int(r[n]), :])) for n in range(N)]
    # was `len(Ln) == n` (the last loop index) and a module-level logging.info call
    logger.info('distortion per point computed, dimension %s',
                'OK' if len(Ln) == N else 'mismatch')
    Ln_mean = np.mean(Ln)
    return Ln_mean, r, Mu
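# A tiny worked example of one update step (values chosen by hand; note that
# kmeans_update modifies Mu in place, hence the .copy()):
if __name__ == '__main__':
    X_toy = np.array([[0.0, 0.0], [0.0, 1.0], [10.0, 10.0], [10.0, 11.0]])
    Mu_toy = np.array([[0.0, 0.5], [10.0, 10.5]])
    Ln_mean, r, Mu = kmeans_update(X_toy, Mu_toy.copy())
    # the two left points go to cluster 0, the two right points to cluster 1;
    # each mean stays at the centroid of its pair, and Ln_mean is 0.5
    print(r.ravel(), Mu, Ln_mean)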