############################# Parameter Setting ##############################
# The number of clusters in KMeans
K = 4
##############################################################################

t0 = time()

# Load training set
file_data_train = open("/home/changyale/dataset/COPDGene/data_" +
                       "train_continuous.pkl", "rb")
data_con_use, features_name_use = pickle.load(file_data_train)
file_data_train.close()

# Prepare reference dataset for continuous features
# Random sample with replacement from training set to form a reference dataset
data_con_use_ref = np.zeros((data_con_use.shape[0], data_con_use.shape[1]))
for j in range(data_con_use.shape[1]):
    tp_index = sample_wr(range(data_con_use_ref.shape[0]),
                         data_con_use_ref.shape[0])
    for i in range(len(tp_index)):
        data_con_use_ref[i, j] = data_con_use[tp_index[i], j]

t1 = time()
print("Preparing data takes " + str(t1 - t0) + " seconds")

# Forward search for continuous features
# Normalization of the continuous dataset
data = scale(data_con_use)
n_instances, n_features = data.shape
data_ref = scale(data_con_use_ref)

# Start with the empty feature set
bfs = []
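# --- Sketch (not in the original script) -------------------------------------
# The reference-set construction above relies on the helper `sample_wr`
# (imported elsewhere in this repository from python.COPDGene.utils.sample_wr)
# to draw indices uniformly with replacement. Its source is not shown in this
# excerpt; a minimal version might look like this:
import random

def sample_wr(population, k):
    """Return a list of k elements drawn from population with replacement.

    Sketch only -- the real python.COPDGene.utils.sample_wr may differ.
    """
    population = list(population)
    return [random.choice(population) for _ in range(k)]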
data_con, features_name_con, features_type_con = info_con

# Choose only 'continuous' features for the search
data_con_use = []
features_name_use = []
for j in range(len(features_type_con)):
    if features_type_con[j] == 'continuous':
        data_con_use.append(data_con[:, j])
        features_name_use.append(features_name_con[j])
data_con_use = np.array(data_con_use).T

# Prepare reference dataset for continuous features
# Random sample with replacement from training set to form a reference dataset
data_con_use_ref = np.zeros((data_con_use.shape[0], data_con_use.shape[1]))
for j in range(data_con_use.shape[1]):
    tp_index = sample_wr(range(data_con_use_ref.shape[0]),
                         data_con_use_ref.shape[0])
    for i in range(len(tp_index)):
        data_con_use_ref[i, j] = data_con_use[tp_index[i], j]

t1 = time()
print("Preparing data takes " + str(t1 - t0) + " seconds")

# Forward search for continuous features
# Normalization of the continuous dataset
data = scale(data_con_use)
#data = data_con_use
n_instances, n_features = data.shape
data_ref = scale(data_con_use_ref)

# Obtain gold value
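# --- Sketch (not in the original script) -------------------------------------
# The forward search itself is not shown in this excerpt. The comments above
# describe a greedy wrapper that grows the selected set `bfs` one feature at a
# time. A generic skeleton of that pattern is sketched below; `score_subset`
# is a placeholder scoring callable, not the criterion used in the original
# code.
def forward_search(n_features, score_subset):
    """Greedy sequential forward selection over feature column indices.

    score_subset(idx_list) should return a clustering-quality score for the
    feature subset idx_list (higher is better). Placeholder criterion only.
    """
    bfs = []
    remaining = list(range(n_features))
    best_prev = -np.inf
    while remaining:
        best_score, best_j = max((score_subset(bfs + [j]), j)
                                 for j in remaining)
        if best_score <= best_prev:
            break  # stop when adding a feature no longer improves the score
        bfs.append(best_j)
        remaining.remove(best_j)
        best_prev = best_score
    return bfs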
file_hsic.close()

file_hsic = open("data/mtr_hsic_nhsic_dis.pkl", "rb")
mtr_hsic_dis, mtr_nhsic_dis = pickle.load(file_hsic)
file_hsic.close()

# Load information about continuous and discrete features
file_data_train = open("/home/changyale/dataset/COPDGene/data_train.pkl", "rb")
info_con, info_dis, gold = pickle.load(file_data_train)
file_data_train.close()

data_con, features_name_con, features_type_con = info_con
data_dis, features_name_dis, features_type_dis = info_dis

# Random sample with replacement from data_train to form a reference dataset
data_train_ref = np.zeros((data_train.shape[0], data_train.shape[1]))
for j in range(data_train_ref.shape[1]):
    tp_index = sample_wr(range(data_train_ref.shape[0]),
                         data_train_ref.shape[0])
    for i in range(len(tp_index)):
        data_train_ref[i, j] = data_train[tp_index[i], j]

# Label data_train as class 0 and data_train_ref as class 1, resulting in a
# dataset "data_use" and its label "labels"
labels = []
data_use = np.zeros((data_train.shape[0] + data_train_ref.shape[0],
                     data_train.shape[1]))
for i in range(data_train.shape[0]):
    data_use[i, :] = data_train[i, :]
    labels.append(0)
for i in range(data_train_ref.shape[0]):
    data_use[data_train.shape[0] + i, :] = data_train_ref[i, :]
    labels.append(1)
labels = np.array(labels)
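# --- Sketch (not in the original script) -------------------------------------
# The excerpt ends after building `data_use` and `labels`. One common use of
# this real-versus-reference labeling (an assumption here, not necessarily the
# next step in the original script) is to train a classifier to separate the
# two classes: features that help the classifier discriminate carry dependency
# structure that the column-wise resampling destroyed.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(data_use, labels)
print(clf.feature_importances_)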
list_m_N = []
list_n_iter = []
list_time = []

for experiment in range(n_experiment):
    print(experiment, "Iteration")
    n_iter = 0
    max_iter = n_instances
    tol = 0.0001
    flag_ratio = np.inf
    t1 = time()
    while n_iter < max_iter and flag_ratio > tol:
        n_iter += 1
        # Sample a data point x_n uniformly from the dataset
        index_row = sample_wr(range(n_instances), 1)
        x_n = data[index_row, :].T
        # Derive S_N and m_N from the global parameters
        # (just a different representation of lambda_0, lambda_1)
        S_N = -0.5*np.linalg.inv(lambda_1)
        m_N = np.dot(S_N, lambda_0)
        # Update local variables
        tp = S_N + m_N.reshape(n_features, 1)*m_N.reshape(1, n_features)
        ks = np.dot(np.dot(x_n.T, tp), x_n)
        var_local_n = np.sqrt(ks)
        # Compute the intermediate global parameters as though x_n were
        # replicated N times
        tp_lambda_0 = np.dot(S_0_inv, m_0) + \
                n_instances*(label[index_row[0]] - 0.5)*x_n
        tp = 0.5/var_local_n*(1./(1 + math.exp(-var_local_n)) - 0.5)
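# --- Sketch (not in the original script) -------------------------------------
# The last line above evaluates the Jaakkola-Jordan variational coefficient
# lambda(xi) = (sigma(xi) - 1/2) / (2*xi) at xi = var_local_n, where
# sigma(xi) = 1 / (1 + exp(-xi)). Pulling it into a helper makes the update
# easier to read; this refactoring is a suggestion, not part of the original.
import math

def jj_lambda(xi):
    """Jaakkola-Jordan coefficient lambda(xi) = (sigma(xi) - 0.5) / (2*xi)."""
    sigma = 1.0 / (1.0 + math.exp(-xi))
    return (sigma - 0.5) / (2.0 * xi)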
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.metrics import normalized_mutual_info_score
import matplotlib.pyplot as plt

from python.COPDGene.utils.sample_wr import sample_wr

# Import the iris dataset
iris = datasets.load_iris()
data_raw = iris.data
labels_true = iris.target

# Normalization of the original dataset
data = scale(data_raw)

# Extract a reference distribution by sampling rows with replacement
data_ref = []
tp_row_id = sample_wr(range(data.shape[0]), data.shape[0])
for i in range(len(tp_row_id)):
    data_ref.append(list(data[tp_row_id[i], :]))
data_ref = np.array(data_ref)

n_clusters_range = range(2, 11)
inertia = [0]*len(n_clusters_range)
inertia_ref = [0]*len(n_clusters_range)
score = [0]*len(n_clusters_range)

for i in range(len(n_clusters_range)):
    # Apply KMeans on the original dataset
    estimator = KMeans(n_clusters=n_clusters_range[i], init='random',
                       n_init=10, n_jobs=-1)
    estimator.fit(data)
    inertia[i] = estimator.inertia_
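    # --- Sketch (not in the original script) ----------------------------------
    # The arrays inertia_ref and score declared above suggest that the loop
    # also fits KMeans on the reference sample and compares the two inertias,
    # gap-statistic style. The exact score used in the original script is not
    # shown; the log-ratio below is an assumption.
    estimator_ref = KMeans(n_clusters=n_clusters_range[i], init='random',
                           n_init=10, n_jobs=-1)
    estimator_ref.fit(data_ref)
    inertia_ref[i] = estimator_ref.inertia_
    score[i] = np.log(inertia_ref[i]) - np.log(inertia[i])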