def save_results_for_KM(root_dir, res_dict, method_name, dat_name):
    """Save the generated clustering-quality results for K-means and its variants."""
    # get the directory where the result is stored
    res_dir = os.path.join(root_dir, 'results', method_name, dat_name)
    f_manager = FileManager(res_dir)
    f_path = os.path.join(res_dir, 'cls_quality.csv')
    f_manager.add_file(f_path)
    print f_path
    # Save the results to one CSV file laid out like:
    #   seed  time  Purity  ARI  ACC  NMI ...
    #   1     ...   ...     ...  ...  ...
    #   2     ...   ...     ...  ...  ...
    field_names = ['seed', 'time', 'Purity', 'ARI', 'ACC', 'NMI', 'd_W']
    # open the file for writing; if it does not exist, it is created
    # ('wb' is the correct mode for the csv module on Python 2)
    with open(f_path, mode='wb') as csv_file:
        # DictWriter maps each result dictionary onto one output row
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        writer.writeheader()  # write the field names to the header
        for key in res_dict:
            writer.writerow(res_dict[key])
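# A minimal usage sketch (not part of the original module): res_dict is keyed
# by seed with one dictionary per row, matching the field names above. The
# directory and metric values below are hypothetical.
def _demo_save_results_for_KM():
    res_dict = {
        1: {'seed': 1, 'time': 0.42, 'Purity': 0.91, 'ARI': 0.83,
            'ACC': 0.88, 'NMI': 0.79, 'd_W': 0.01},
        2: {'seed': 2, 'time': 0.40, 'Purity': 0.90, 'ARI': 0.81,
            'ACC': 0.87, 'NMI': 0.78, 'd_W': 0.02},
    }
    save_results_for_KM('/tmp/onmf_exp', res_dict, 'kmeans', 'mnist0')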
def full_db():
    # Build a FileManager backed by an in-memory SQLite database and populate
    # it with every entry from the FILES fixture.
    fm = FileManager('sqlite://')
    fm.init_db()
    for flow_id, object_name, size in FILES:
        parts = flow_id.split('/')
        dataset_id = '/'.join(parts[:2])
        object_name = '/'.join([dataset_id, object_name])
        # keep a reverse index from object name to the flows that produced it
        FILE_TO_FLOW.setdefault(object_name, set()).add(flow_id)
        fm.add_file('bucket', object_name, PRIVACY[dataset_id], parts[0],
                    parts[0] + '_id', dataset_id, flow_id, size, now)
    return fm
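# Hedged sketch of the fixtures full_db() relies on (the names come from the
# test module; the concrete values here are hypothetical): FILES holds
# (flow_id, object_name, size) triples, PRIVACY maps a dataset id to its
# privacy setting, FILE_TO_FLOW is filled in as a reverse index, and `now`
# is a shared timestamp.
#
#   FILES = [('org/ds/flow1', 'part-0.csv', 1024)]
#   PRIVACY = {'org/ds': 'private'}
#   FILE_TO_FLOW = {}
#   now = datetime.datetime.utcnow()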
def gather_results_by_seeds(root_dir, seeds):
    '''
    Gather the results generated by different initializations of the NCP methods.

    Args:
        root_dir: the parent directory
        seeds:    a list of the seeds used

    Returns:
        None; one CSV file per metric is written under root_dir, collecting
        the results from all seed sub-directories.
    '''
    size = len(seeds)  # the number of initializations
    # One buffer per metric; columns 0..size-1 hold the per-seed curves and
    # the last two columns hold the mean and std over all seeds. Unfilled
    # entries stay at -1.
    nmf_palm = np.ones((8000, size + 2)) * (-1)
    nmf_sncp = np.ones((8000, size + 2)) * (-1)
    onmf_cost_palm = np.ones((8000, size + 2)) * (-1)
    onmf_cost_sncp = np.ones((8000, size + 2)) * (-1)
    cost_palm = np.ones((8000, size + 2)) * (-1)
    cost_sncp = np.ones((8000, size + 2)) * (-1)
    WH_nr = np.ones((8000, size + 2)) * (-1)
    ortho_nr = np.ones((8000, size + 2)) * (-1)
    cls_acc = np.ones((8000, size + 2)) * (-1)

    # index columns by position in the seed list, not by the seed value itself
    for col, seed in enumerate(seeds):
        res_path = os.path.join(root_dir, 'seed' + str(seed), 'res.csv')
        cls_path = os.path.join(root_dir, 'seed' + str(seed), 'cls_quality.csv')
        if not os.path.exists(res_path) or not os.path.exists(cls_path):
            raise ValueError('Error: the result path cannot be found!')
        df = pd.read_csv(res_path, header=0)
        # convert the dataframe into a numpy array
        res_arr = df.values
        dim = res_arr.shape[0]
        nmf_palm[0:dim, col] = res_arr[:, 0]        # NMF cost vs PALM
        cost_palm[0:dim, col] = res_arr[:, 1]       # cost vs PALM
        nmf_sncp[0:dim, col] = res_arr[:, 2]        # NMF cost vs SNCP
        onmf_cost_sncp[0:dim, col] = res_arr[:, 3]  # ONMF cost vs SNCP
        onmf_cost_palm[0:dim, col] = res_arr[:, 4]  # ONMF cost vs PALM
        cost_sncp[0:dim, col] = res_arr[:, 5]       # cost vs SNCP
        # W/H normalized residuals (the first row is skipped)
        WH_nr[0:dim - 1, col] = res_arr[1:, 12].astype(float) + \
            res_arr[1:, 13].astype(float)
        ortho_nr[0:dim, col] = res_arr[:, 14]       # orthogonality residual

        df2 = pd.read_csv(cls_path, header=0)
        cls_arr = df2.values
        dim = cls_arr.shape[0]
        cls_acc[0:dim, col] = cls_arr[:, 5]         # clustering accuracy vs SNCP

    f_manager = FileManager(root_dir)

    def save_with_stats(arr, file_name):
        # fill the trailing mean/std columns and write the matrix as CSV
        arr[:, size] = np.mean(arr[:, 0:size], axis=1)
        arr[:, size + 1] = np.std(arr[:, 0:size], axis=1)
        path = os.path.join(root_dir, file_name)
        f_manager.add_file(path)
        np.savetxt(path, np.asmatrix(arr), delimiter=',', fmt='%.30f')

    save_with_stats(nmf_palm, 'nmf_cost_palm.csv')
    save_with_stats(nmf_sncp, 'nmf_cost_sncp.csv')
    save_with_stats(onmf_cost_palm, 'onmf_cost_palm.csv')
    save_with_stats(onmf_cost_sncp, 'onmf_cost_sncp.csv')
    save_with_stats(cost_sncp, 'cost_sncp.csv')
    save_with_stats(cost_palm, 'cost_palm.csv')
    save_with_stats(WH_nr, 'WH_NR.csv')
    save_with_stats(ortho_nr, 'ortho_NR.csv')
    save_with_stats(cls_acc, 'cls_acc.csv')
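# A usage sketch (hypothetical root directory): gather_results_by_seeds
# expects one sub-directory per seed under root_dir, e.g.
#   root_dir/seed1/res.csv, root_dir/seed1/cls_quality.csv, ...
# and writes the aggregated per-metric CSVs (per-seed columns plus trailing
# mean and std columns) directly under root_dir.
#
#   gather_results_by_seeds('/tmp/onmf_exp/results', seeds=[1, 2, 3])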
def gen_inits_WH(self, init='random', seed=1, H_ortho=True):
    '''
    Initialize the factors W, H for nonnegative matrix factorization.

    There are several options:
        1. random   ---- generate W, H randomly
        2. kmeans   ---- generate H from the cluster assignments obtained by
                         K-means, then W = data_mat * H (since H is orthogonal)
        3. nmf      ---- run sklearn's NMF on the data matrix first to get
                         W, H for the initialization
        4. kmeans++ ---- use the k-means++ heuristic to get cluster
                         assignments for H, then W = data_mat * H

    Args:
        init (string): the name of the method used for generating the
            initializations
        seed (int): the seed for the random generator
        H_ortho (bool): whether H should be projected onto an orthogonal
            cluster-indicator matrix

    Returns:
        numpy matrices W_init and H_init
    '''
    ortho = 'ortho' if H_ortho else ''
    data_name = self.data_kind + str(self.data_num)
    initW_path = os.path.join(self.root_dir, 'inits', data_name,
                              'W' + str(seed) + '.csv')
    initH_path = os.path.join(self.root_dir, 'inits', data_name,
                              'H' + '_' + ortho + str(seed) + '.csv')
    if os.path.exists(initW_path) and os.path.exists(initH_path) \
            and seed < 100:
        # cached initializations exist, just read them
        W_init = self.read_data_from_csvfile(initW_path)
        H_init = self.read_data_from_csvfile(initH_path)
    else:
        # get the size of the data matrix to be decomposed
        (m, n) = self.data_mat.shape
        np.random.seed(seed)
        if init == 'random':
            abs_mat = np.absolute(self.data_mat)
            avg = np.sqrt(abs_mat.mean() / self.num_of_cls)
            print 'mean: ' + str(abs_mat.mean())
            print 'rank: ' + str(self.num_of_cls)
            print 'avg: ' + str(avg)
            W_init = np.asmatrix(avg * np.random.random((m, self.num_of_cls)))
            H_init = np.asmatrix(avg * np.random.random((n, self.num_of_cls)))
        elif init == 'kmeans':
            km = sklearn_KMeans(n_clusters=self.num_of_cls).fit(
                self.data_mat.transpose())
            clusters = km.predict(self.data_mat.transpose())
            H_init = np.asmatrix(np.zeros((n, self.num_of_cls)))
            for i in range(len(clusters)):
                H_init[i, clusters[i]] = 1
            # normalize the columns of H so that H^T * H = I
            H_init = H_init * np.diag(
                np.diag(H_init.transpose() * H_init)**(-0.5))
            W_init = self.data_mat * H_init
        elif init == 'nmf':
            model = sklearn_NMF(n_components=self.num_of_cls,
                                init='nndsvd', random_state=0)
            W = model.fit_transform(self.data_mat.transpose())
            H = model.components_
            H_init = np.asmatrix(W)
            W_init = np.asmatrix(H).transpose()
        elif init == 'kmeans++':
            print 'using k++ initialization....'
            data_mat = self.data_mat.transpose()  # samples are rows now
            initial_centroids = np.ones((self.num_of_cls, m)) * (-1)
            ind_list = []
            idx = np.random.choice(n)
            ind_list.append(idx)
            initial_centroids[0, :] = data_mat[idx, :]
            # pick each remaining centroid with probability proportional to
            # the squared distance to the nearest centroid chosen so far
            while len(ind_list) < self.num_of_cls:
                cent = initial_centroids[0:len(ind_list), :]
                D2 = np.array([
                    min([LA.norm(x - c)**2 for c in cent]) for x in data_mat
                ])
                probs = D2 / D2.sum()
                cumprobs = probs.cumsum()
                r = np.random.random()
                idx = np.where(cumprobs >= r)[0][0]
                ind_list.append(idx)
                initial_centroids[len(ind_list) - 1, :] = data_mat[idx, :]
            print ind_list
            W_init = np.asmatrix(initial_centroids).transpose()
            # assign every sample to its nearest centroid (n x K distances)
            distances = np.ones((n, self.num_of_cls)) * (-1)
            for centroid_idx in range(self.num_of_cls):
                for data_idx in range(n):
                    distances[data_idx, centroid_idx] = LA.norm(
                        data_mat[data_idx, :] -
                        initial_centroids[centroid_idx, :])
            cluster_assignments = np.argmin(distances, axis=1)
            temp_H = np.asmatrix(np.zeros((n, self.num_of_cls)))
            for j in range(n):
                temp_H[j, cluster_assignments[j]] = 1
            H_init = np.asmatrix(temp_H)
        else:
            raise ValueError(
                'Error: invalid parameter - init '
                '(random, kmeans, nmf, kmeans++)!!')
        H_init = np.asmatrix(H_init.transpose())
        if H_ortho:
            (ha, hb) = H_init.shape
            ortho = LA.norm(
                H_init * H_init.transpose() - np.asmatrix(np.eye(ha)), 'fro')
            print H_init * H_init.transpose()
            if ortho > 1e-6:
                # project H onto the set of cluster-indicator matrices: keep
                # only the largest entry in each column ...
                H = np.zeros((ha, hb))
                ind = np.asarray(np.argmax(H_init, 0))[0, :]
                for j in range(hb):
                    H[ind[j], j] = 1
                H = np.asmatrix(H)
                temp = np.diag(H * H.transpose())
                if np.any(temp == 0):
                    print temp
                    raise ValueError("some rows of H are zeros!!!")
                # ... then rescale the rows so that H * H^T = I
                H = np.asmatrix(np.diag(temp**(-0.5))) * H
                H_init = H
        if seed >= 100:
            np.random.seed(seed)
            (m, n) = self.data_mat.shape
            # pick the class with the most members
            cls_idx, cls_sizes = np.unique(self.true_labels,
                                           return_counts=True)
            s_id = cls_idx[np.argmax(cls_sizes)]
            id_list = np.where(self.true_labels == s_id)[0]
            print s_id
            print id_list
            dis_mat = pdist(self.data_mat.transpose())
            print np.argmin(dis_mat)
            print np.unravel_index(dis_mat.argmin(), dis_mat.shape)
            print np.where(dis_mat == np.min(dis_mat[np.nonzero(dis_mat)]))
            print 'select initial points -----'
            select_idx = [997, 998, 999]
            #select_idx = np.random.choice(id_list, self.num_of_cls, replace=False)
            print select_idx
            W_init = np.asmatrix(self.data_mat[:, select_idx])
            print W_init.shape
        # save the generated initializations
        f_manager = FileManager(self.root_dir)
        f_manager.add_file(initW_path)
        np.savetxt(initW_path, np.asmatrix(W_init), delimiter=',')
        f_manager.add_file(initH_path)
        np.savetxt(initH_path, np.asmatrix(H_init), delimiter=',')
    return np.asmatrix(W_init), np.asmatrix(H_init)
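# A self-contained sketch (not part of the original module) of the H_ortho
# projection used above: keep only the largest entry in each column of H,
# then rescale the rows so that H * H^T = I.
def _demo_project_H_orthogonal():
    import numpy as np
    H = np.asmatrix(np.random.random((3, 9)))  # k x n with k = 3 clusters
    H[:, 0:3] = H[:, 0:3] + 2 * np.eye(3)      # ensure every cluster wins a column
    ind = np.asarray(np.argmax(H, 0))[0, :]    # winning cluster per column
    P = np.asmatrix(np.zeros(H.shape))
    for j in range(H.shape[1]):
        P[ind[j], j] = 1
    sizes = np.diag(P * P.transpose())         # number of columns per cluster
    P = np.asmatrix(np.diag(sizes**(-0.5))) * P
    return P                                   # now P * P.T equals the identity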
def __init__(self, root_dir, is_real, data_kind, data_num, has_outliers=True,
             dim_reduced=False, num_of_features=2000, num_of_samples=1000,
             num_of_cls=10, seed=0):
    self.root_dir = root_dir
    self.is_real = is_real
    self.num_of_cls = num_of_cls
    self.data_kind = data_kind
    self.data_num = data_num
    self.has_outliers = has_outliers
    self.num_of_features = num_of_features
    self.num_of_samples = num_of_samples
    self.dim_reduced = dim_reduced
    dr_str = 'DR' if dim_reduced else ''
    outliers = 'otlrs' if has_outliers else ''
    if is_real:
        # cache the newly generated data so that we don't need to regenerate it
        if not self.data_kind in {'mnist', 'tdt2', 'tcga'}:
            raise ValueError('Error: other data kinds are not supported now!')
        data_path = os.path.join(
            self.root_dir, 'real_data', self.data_kind,
            'data' + dr_str + '#' + str(self.data_num) + '.csv')
        label_path = os.path.join(
            self.root_dir, 'real_data', self.data_kind,
            'label#' + str(self.data_num) + '.csv')
        print data_path
        print label_path
        print self.root_dir
        if os.path.exists(data_path):
            # the data file exists, just read it
            self.data_mat = self.read_data_from_csvfile(data_path)
            if not self.dim_reduced and self.data_kind in {'tdt2', 'mnist'}:
                self.data_mat = self.data_mat.transpose()
            self.true_labels = self.read_data_from_csvfile(label_path)
            print 'labels shape: ' + str(self.true_labels.shape)
            if self.data_kind in {'tdt2', 'mnist'}:
                self.true_labels = self.true_labels.transpose()
            # labels are stored as a matrix, so extract row 0
            self.true_labels = self.true_labels[0, :]
            self.existed = True
        else:
            # check whether the original dataset (before dimensionality
            # reduction) exists
            print False
            orig_data_path = os.path.join(
                self.root_dir, 'real_data', self.data_kind,
                'data#' + str(self.data_num) + '_seed' + str(seed) + '.csv')
            orig_label_path = os.path.join(
                self.root_dir, 'real_data', self.data_kind,
                'label#' + str(self.data_num) + '_seed' + str(seed) + '.csv')
            if os.path.exists(orig_data_path):
                self.data_mat = self.read_data_from_csvfile(orig_data_path)
                labels = self.read_data_from_csvfile(orig_label_path)
                print(self.data_mat.shape)
                self.true_labels = labels.transpose()[0, :]
                if self.dim_reduced:
                    self.data_mat = self.dim_reduction_by_spectral()
                    self.data_mat = self.data_mat.transpose()
                f_manager = FileManager(self.root_dir)
                f_manager.add_file(data_path)
                np.savetxt(data_path, np.asmatrix(self.data_mat),
                           delimiter=',')
                f_manager.add_file(label_path)
                np.savetxt(label_path, np.asmatrix(self.true_labels),
                           delimiter=',')
                self.existed = False
            else:
                raise ValueError('Error: no available datasets')
    else:
        print('seed: ' + str(seed))
        np.random.seed(seed)  # set the seed
        # first, check whether the data file has already been generated
        data_path = os.path.join(
            self.root_dir, 'synthetic_data',
            self.data_kind + '#' + str(self.data_num) + '_' + dr_str + '_'
            + str(self.num_of_features) + 'x' + str(self.num_of_samples)
            + '_K' + str(self.num_of_cls) + '_seed' + str(seed) + '.csv')
        label_path = os.path.join(
            self.root_dir, 'synthetic_data',
            self.data_kind + '#' + str(self.data_num) + '_' + dr_str + '_'
            + str(self.num_of_features) + 'x' + str(self.num_of_samples)
            + '_K' + str(self.num_of_cls) + '_seed' + str(seed) + '_label.csv')
        print data_path
        if os.path.exists(data_path):
            # the data file exists, just read it
            self.data_mat = self.read_data_from_csvfile(data_path)
            self.true_labels = self.read_data_from_csvfile(label_path)
            # labels are stored as a matrix, so extract row 0
            self.true_labels = self.true_labels[0, :]
            self.existed = True
        else:
            if self.data_kind.startswith('syn'):
                # generate synthetic data with the linear model
                self.data_mat, self.true_labels = self.gen_data_with_noise(
                    self.num_of_features, self.num_of_samples,
                    self.num_of_cls, self.data_num, self.has_outliers)
                if self.dim_reduced:
                    self.data_mat = self.dim_reduction_by_spectral()
                    self.data_mat = self.data_mat.transpose()
            elif self.data_kind.startswith('2d'):
                self.data_mat, self.true_labels = \
                    self.gen_2Data_with_3clusters(data_num=self.data_num)
            else:
                raise ValueError('Error: no other synthetic datasets!')
            f_manager = FileManager(self.root_dir)
            f_manager.add_file(data_path)
            np.savetxt(data_path, np.asmatrix(self.data_mat), delimiter=',')
            f_manager.add_file(label_path)
            np.savetxt(label_path, np.asmatrix(self.true_labels),
                       delimiter=',')
            self.existed = False
        print 'data_mat'
        print self.data_mat.shape
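# A usage sketch (hypothetical): `DataLoader` stands for the surrounding
# class, whose name is not shown in this section, and the root directory is
# made up. Loading real data reads (or builds and caches) the CSV files under
# root_dir/real_data/<data_kind>/.
#
#   loader = DataLoader('/tmp/onmf_exp', is_real=True,
#                       data_kind='mnist', data_num=0)
#   X, y = loader.data_mat, loader.true_labels  # features x samples, row of labels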