import glob
import os

import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold, train_test_split

import utils

# Project-local names used below but defined elsewhere in the repo:
# read_sachs_all, DataLoader, config, MedlatModel, over_sampling,
# phi_star_gen, euclideanDistanceTransform, optimTrajectory, bsplineUpsample.


def preprocess_sachs(folder_path, save_path, num_envs, normalization='standard'):
    """Preprocesses the Sachs dataset.

    Args:
        folder_path: Path to read the Sachs data from.
        save_path: Path to save the preprocessed data to.
        num_envs: The number of environments to cluster the data into.
        normalization: Normalization option.

    Returns:
        The Sachs dataset split into environments, with shape
        [num_envs, number of samples in each env, number of features].
    """
    np.set_printoptions(precision=3)
    X = read_sachs_all(folder_path)
    _, d = X.shape
    # fit_predict fits KMeans and returns the cluster labels in one call.
    kmeans = KMeans(n_clusters=num_envs, max_iter=1000)
    labeled_X = kmeans.fit_predict(X)
    X, Y = utils.preprocess_labeled_data(X, labeled_X, normalization)
    X_res, Y_res = utils.over_sampling(X, Y)
    X_envs = utils.classify_x(X_res, Y_res)  # a dict keyed by cluster id
    os.makedirs(save_path, exist_ok=True)
    for i, x_env in X_envs.items():
        exp = save_path + f'sachs_env_{i+1}_{num_envs}.csv'
        if x_env.shape[0] > 1:
            np.savetxt(exp, x_env, delimiter=',')
    return utils.distribute_to_envs(X_envs)
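# `utils` is the repo's own module and is not shown here. Below is a rough,
# hypothetical sketch of the contract the two helpers appear to satisfy (an
# assumption, not the repo's actual code): `classify_x` groups rows into a
# dict keyed by cluster label, and `over_sampling` balances cluster sizes by
# resampling, e.g. via imblearn's RandomOverSampler (assumed available).
def classify_x_sketch(x, labels):
    """Groups the rows of x into a dict keyed by integer cluster label."""
    return {int(k): x[labels == k] for k in np.unique(labels)}


def over_sampling_sketch(x, labels):
    """Resamples minority clusters so every cluster reaches the majority size."""
    from imblearn.over_sampling import RandomOverSampler
    x_res, y_res = RandomOverSampler(random_state=0).fit_resample(x, labels)
    return x_res, y_res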
def preprocess_BH(save_path, cluster):
    """Clusters the Boston Housing data and preprocesses it.

    Args:
        save_path: The path to save the preprocessed data to.
        cluster: Number of clusters to split the dataset into.

    Returns:
        A tensor of [num_envs, number of samples in each env, number of
        features] holding the training environments, and the path of the
        saved test set.
    """
    np.set_printoptions(precision=3)
    x_raw = utils.load_BH()
    label = np.ones(len(x_raw))
    # Train/test split only.
    x, x_test, _, _ = train_test_split(
        x_raw, label, test_size=0.1, random_state=42)
    # Train/val/test split variant, kept for reference:
    # x_train_val, x_test, _, _ = train_test_split(
    #     x_raw, label, test_size=0.1, random_state=42)
    # x, x_val, _, _ = train_test_split(
    #     x_train_val, np.ones(len(x_train_val)), test_size=0.1, random_state=1)
    n, d = x.shape
    # fit_predict fits KMeans and returns the cluster labels in one call.
    kmeans = KMeans(n_clusters=cluster, max_iter=1000)
    labeled_x = kmeans.fit_predict(x)
    os.makedirs(save_path, exist_ok=True)
    x_upsampled, y_upsampled = utils.over_sampling(x, labeled_x)
    x_envs = utils.classify_x(x_upsampled, y_upsampled)  # a dict keyed by cluster id
    standard_scaler = preprocessing.StandardScaler()
    x_train = list()
    # Standardize and save the data for each environment.
    for i, (_, x_env) in enumerate(x_envs.items(), 1):
        standardx = standard_scaler.fit_transform(x_env)
        exp = save_path + f'standard_BH_env_{i}_{cluster}.csv'
        np.savetxt(exp, standardx, fmt='%.3f', delimiter=',')
        x_train.append(standardx)
    # Standardize the train and test sets using the mean and std of the
    # train set.
    standard_scaler.fit(x)
    standardxtrain = standard_scaler.transform(x)
    x_trainexp = save_path + 'standard_BH_train.csv'
    np.savetxt(x_trainexp, standardxtrain, fmt='%.3f', delimiter=',')
    standardxtest = standard_scaler.transform(x_test)
    x_testexp = save_path + 'standard_BH_test.csv'
    np.savetxt(x_testexp, standardxtest, fmt='%.3f', delimiter=',')
    return np.stack(x_train), x_testexp
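# A minimal sketch of the scaling contract used above, on synthetic data:
# StandardScaler is fit on the training split only, and the stored mean/std
# are reused for the test split so no test statistics leak into preprocessing.
def _standardize_train_test_demo():
    rng = np.random.default_rng(0)
    train, test = rng.normal(size=(100, 3)), rng.normal(size=(20, 3))
    scaler = preprocessing.StandardScaler().fit(train)  # train statistics only
    return scaler.transform(train), scaler.transform(test)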
def main():
    loader = DataLoader(config.pos_path, config.nag_path)  # positive/negative sample generator
    train_data, labels = loader.load_data()
    skf = StratifiedKFold(n_splits=config.kfold, shuffle=True, random_state=2017)
    # Stratify on the real labels; passing all-zero labels would disable
    # stratification.
    for i, (train_idx, vali_idx) in enumerate(skf.split(train_data, labels), 1):
        train_x, train_y = train_data[train_idx], labels[train_idx]  # CV training split
        train_x, train_y = over_sampling(
            train_x, train_y, config.pos_nag_rato)  # oversample to the required ratio
        vali_x, vali_y = train_data[vali_idx], labels[vali_idx]  # CV validation split
        # vali_x, vali_y = over_sampling(vali_x, vali_y, config.pos_nag_rato)
        # Uncomment the line above to oversample the validation set as well.
        model = MedlatModel("./cache/models/cnn_model_%d.h5" % i)
        train_x = loader.get_inputs_set(train_x)
        vali_x = loader.get_inputs_set(vali_x)
        # One-hot encode the binary labels: 1.0 -> [0, 1], 0.0 -> [1, 0].
        train_y = np.array([[0., 1.] if y == 1.0 else [1., 0.] for y in train_y])
        vali_y = np.array([[0., 1.] if y == 1.0 else [1., 0.] for y in vali_y])
        model.train_model(train_x, train_y, vali_x, vali_y)
        data_y, pred = model.evaluate(vali_x, vali_y)
        if i == 1:
            all_pred_y = pred
            all_real_y = data_y
        else:
            all_pred_y = np.concatenate((all_pred_y, pred), axis=0)
            all_real_y = np.concatenate((all_real_y, data_y), axis=0)
        print("*" * 20 + " %d fold end " % i + "*" * 20)
        # if i >= 1: break  # comment this line out when running full cross-validation

    ######################### Overall evaluation #########################
    roc_df = pd.DataFrame(all_pred_y, columns=["prob_0", "prob_1"])
    roc_df["real"] = all_real_y
    roc_df.to_csv(config.evaluate_file, index=False, index_label=False)
    print("%s has been saved. Please download it." % config.evaluate_file)
    all_pred_y = np.argmax(all_pred_y, axis=1)
    confuse_matrix = metrics.confusion_matrix(all_real_y, all_pred_y)
    TN, FP, FN, TP = confuse_matrix.ravel()
    print("confusion matrix:\n", confuse_matrix)
    print("accuracy score: ", metrics.accuracy_score(all_real_y, all_pred_y))
    print("classification report:\n",
          metrics.classification_report(all_real_y, all_pred_y))
    print("F1 score: ", metrics.f1_score(all_real_y, all_pred_y))
    print("recall: ", 1. * TP / (TP + FN))
    print("False positive rate: ", 1. * FP / (FP + TN))
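# `over_sampling(x, y, ratio)` in the script above is project-local and
# differs from the two-argument utils.over_sampling used earlier. A
# hypothetical sketch of its contract (an assumption, not the project's
# actual code): duplicate positive samples at random until the
# positive:negative ratio reaches the requested value.
def over_sampling_ratio_sketch(x, y, pos_neg_ratio):
    """Duplicates positive rows until #pos / #neg >= pos_neg_ratio."""
    pos_idx = np.flatnonzero(y == 1)
    neg_idx = np.flatnonzero(y == 0)
    target_pos = int(np.ceil(pos_neg_ratio * len(neg_idx)))
    if len(pos_idx) >= target_pos:
        return x, y
    extra = np.random.choice(pos_idx, target_pos - len(pos_idx), replace=True)
    keep = np.concatenate([np.arange(len(y)), extra])
    np.random.shuffle(keep)
    return x[keep], y[keep]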
def cluster_Insurance(x, num_cluster):
    """Clusters the Insurance data into different environments.

    Args:
        x: The data to be clustered.
        num_cluster: The number of clusters.

    Returns:
        A dict mapping each cluster id to the (oversampled) data assigned to
        that environment.
    """
    # fit_predict fits KMeans and returns the cluster labels in one call.
    kmeans = KMeans(n_clusters=num_cluster, max_iter=1000)
    labeled_x = kmeans.fit_predict(x)
    x_upsampled, y_upsampled = utils.over_sampling(x, labeled_x)
    x_envs = utils.classify_x(x_upsampled, y_upsampled)  # a dict keyed by cluster id
    return x_envs
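# Tiny usage demo on synthetic data (the real Insurance dataset is assumed to
# be loaded elsewhere in the repo):
def _cluster_insurance_demo():
    x = np.random.default_rng(0).normal(size=(200, 10))
    envs = cluster_Insurance(x, num_cluster=3)
    return {k: v.shape for k, v in envs.items()}  # env id -> (rows, 10)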
def main():
    path, grid_obs, start, goal = phi_star_gen()
    # Densify the path so consecutive waypoints are at most max_length apart.
    path = np.array(over_sampling([p.pos for p in path], max_length=1))
    # Manual waypoint perturbations, kept for experimentation:
    # path[5, :] = np.array([2, 5])
    # path[13, :] = np.array([5, 17])
    # path[20, :] = np.array([15, 2])
    # Straight-line baseline from start to goal:
    # path = np.array([[start[0], start[1]], [goal[0], goal[1]]])
    # path = over_sampling(path, max_length=1)
    distObs = euclideanDistanceTransform(grid_obs)
    pathOptimized = optimTrajectory(path, distObs, grid_obs, trajDuration=10)
    print(path, pathOptimized)
    smoothed = bsplineUpsample(pathOptimized)
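# `over_sampling` in this script is a trajectory helper, not the
# class-balancing one above: it densifies a waypoint list so that consecutive
# points are at most `max_length` apart. A hypothetical sketch of that
# contract (an assumption, not the project's actual code):
def over_sampling_path_sketch(points, max_length=1.0):
    """Inserts linear interpolation points between consecutive waypoints."""
    points = np.asarray(points, dtype=float)
    out = [points[0]]
    for a, b in zip(points[:-1], points[1:]):
        n = int(np.ceil(np.linalg.norm(b - a) / max_length))
        for t in np.linspace(0.0, 1.0, n + 1)[1:]:
            out.append(a + t * (b - a))
    return np.array(out)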
def read_sachs_to_envs(folder_path, num_envs, normalization):
    """Loads the Sachs data and returns it divided into environments.

    Args:
        folder_path: Path to read the Sachs data from.
        num_envs: The number of environments to divide the data into.
        normalization: Normalization type.

    Returns:
        A tensor with shape [num_envs, number of samples in each env,
        number of features].
    """
    sachs_data = list()
    if num_envs == 14:
        # Treat every .xls file as its own environment.
        y_label = []
        for i, file in enumerate(glob.glob(f'{folder_path}*.xls')):
            sachs_df = pd.read_excel(file)
            sachs_array = sachs_df.to_numpy()
            sachs_array = utils.preprocess(sachs_array, normalization)
            sachs_data.append(sachs_array)
            y_label.append(np.ones(sachs_array.shape[0]) * i)
        sachs_data_envs = np.vstack(sachs_data)
        sachs_data_labels = np.hstack(y_label)
        X_res, Y_res = utils.over_sampling(sachs_data_envs, sachs_data_labels)
        X_cluster = utils.classify_x(X_res, Y_res)
        X_envs = utils.distribute_to_envs(X_cluster)
    elif num_envs == 2:
        # Files 1-9 form the first environment, the rest the second.
        X_envs = [None] * num_envs
        y_label = [None] * num_envs
        for file in glob.glob(f'{folder_path}*.xls'):
            # Extract the numeric file index from the file name.
            start_index = file.index('sachs_data/') + len('sachs_data/')
            end_index = file.index(' ') - 1
            file_index = int(file[start_index:end_index])
            label = 0 if file_index <= 9 else 1
            sachs_df = pd.read_excel(file)
            sachs_array = sachs_df.to_numpy()
            if X_envs[label] is None:
                X_envs[label] = sachs_array
                y_label[label] = np.ones(sachs_array.shape[0]) * label
            else:
                X_envs[label] = np.concatenate((X_envs[label], sachs_array), axis=0)
                y_label[label] = np.concatenate(
                    (y_label[label], np.ones(sachs_array.shape[0]) * label), axis=0)
        for i in range(num_envs):
            X_envs[i], y_label[i] = utils.preprocess_labeled_data(
                X_envs[i], y_label[i], normalization)
        X = np.vstack(X_envs)
        Y = np.hstack(y_label)
        X_res, Y_res = utils.over_sampling(X, Y)
        X_cluster = utils.classify_x(X_res, Y_res)
        X_envs = utils.distribute_to_envs(X_cluster)
    elif num_envs == 3:
        # Use glob to check for the cached cluster CSVs, since the path
        # contains a wildcard.
        if not glob.glob('./data/cluster/*3.csv'):
            X_envs = preprocess_sachs(folder_path, './data/cluster/', 3,
                                      normalization)
        else:
            for file in glob.glob('./data/cluster/*3.csv'):
                sachs_array = np.loadtxt(file, delimiter=',')
                # sachs_array = utils.preprocess(sachs_array, normalization)
                sachs_data.append(sachs_array)
            X_envs = np.stack(sachs_data)
    elif num_envs == 6:
        if not glob.glob('./data/cluster/*6.csv'):
            X_envs = preprocess_sachs(folder_path, './data/cluster/', 6,
                                      normalization)
        else:
            for file in glob.glob('./data/cluster/*6.csv'):
                sachs_array = np.loadtxt(file, delimiter=',')
                sachs_array = utils.preprocess(sachs_array, normalization)
                sachs_data.append(sachs_array)
            X_envs = np.stack(sachs_data)
    elif num_envs == 7:
        # Assign files to environments round-robin by file index.
        X_envs = [None] * num_envs
        y_label = [None] * num_envs
        for file in glob.glob(f'{folder_path}*.xls'):
            start_index = file.index('sachs_data/') + len('sachs_data/')
            end_index = file.index(' ') - 1
            file_index = int(file[start_index:end_index])
            label = file_index % num_envs
            sachs_df = pd.read_excel(file)
            sachs_array = sachs_df.to_numpy()
            if X_envs[label] is None:
                X_envs[label] = sachs_array
                y_label[label] = np.ones(sachs_array.shape[0]) * label
            else:
                X_envs[label] = np.concatenate((X_envs[label], sachs_array), axis=0)
                y_label[label] = np.concatenate(
                    (y_label[label], np.ones(sachs_array.shape[0]) * label), axis=0)
        for i in range(num_envs):
            X_envs[i], y_label[i] = utils.preprocess_labeled_data(
                X_envs[i], y_label[i], normalization)
        X = np.vstack(X_envs)
        Y = np.hstack(y_label)
        X_res, Y_res = utils.over_sampling(X, Y)
        X_cluster = utils.classify_x(X_res, Y_res)
        X_envs = utils.distribute_to_envs(X_cluster)
    return X_envs
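# Usage sketch, assuming the Sachs .xls files live under ./data/sachs_data/
# (a hypothetical layout; adjust to where the data actually resides):
def _read_sachs_demo():
    x_envs = read_sachs_to_envs('./data/sachs_data/', num_envs=7,
                                normalization='standard')
    print(x_envs.shape)  # (7, samples per env, num features)
    return x_envs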