def CV(X, Y, Model, params, n_splits=3, numlabel=True, shuffle=True, groups=None):
    """Group-aware k-fold cross-validation.

    Folds are taken over groups of sample indices (reactant combinations),
    so samples from the same combination never straddle the train/test line.

    in:
        X: array, shape (n_samples, n_features)
        Y: array, shape (n_samples,), labels
        Model: estimator class (e.g. SVC)
        params: dict, keyword arguments passed to Model(**params)
        n_splits: int, number of folds
        numlabel: bool, if True collapse numeric labels to binary with
            numout2boolout before scoring
        shuffle: bool, forwarded to KFold
        groups: optional sequence of index lists; defaults to the
            module-level reactantCombination (kept for backward compatibility)
    out:
        [mean recall, mean precision, mean accuracy]
        (recall/precision use average='weighted')
    """
    if groups is None:
        # Original behavior: fall back to the module-level grouping.
        groups = reactantCombination
    X_std = StandardScaler().fit_transform(X)
    kf = KFold(n_splits=n_splits, shuffle=shuffle)
    rec = prec = acc = 0.0
    for train_groups, test_groups in kf.split(groups):
        # Expand group indices into flat sample indices.
        train_index = [i for g in train_groups for i in groups[g]]
        test_index = [i for g in test_groups for i in groups[g]]
        X_train, X_test = X_std[train_index], X_std[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        model = Model(**params)
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        if numlabel:
            # Collapse the 4-way labels to binary before scoring.
            Y_test = numout2boolout(Y_test)
            pred = numout2boolout(pred)
        rec += recall_score(Y_test, pred, average='weighted')
        prec += precision_score(Y_test, pred, average="weighted")
        acc += accuracy_score(Y_test, pred)
    return [rec / n_splits, prec / n_splits, acc / n_splits]
def use_SelectKBest(n, print_log=True, interpret_tree=False, cv_author=False):
    """Pick the n best features with SelectKBest and evaluate an SVM on them.

    in:
        n: int, # of features to be selected
        print_log: bool
    out:
        precision: float, precision_score of svm using choosed features
        selected: array(bool array), shape(n_features), True for features
            selected else False
    """
    selector = SelectKBest(k=n)
    X_selected = selector.fit_transform(X, Y)
    mask = selector.get_support()

    if print_log:
        print('features selected by SelectKBest:')
        print(X_feature_name[mask])

    # Fit an SVM on the reduced training matrix, score it on the test split.
    svm = SVC(kernel=PUK_kernel, class_weight="balanced", C=1)
    svm.fit(X_selected, Y)
    predictions = svm.predict(selector.transform(x))
    precision = precision_score(numout2boolout(y), numout2boolout(predictions))
    cm = confusion_matrix(numout2boolout(y), numout2boolout(predictions))

    if print_log:
        print('precision={0:.3f}'.format(precision))
        print("confusion matrix:")
        print(cm)

    if cv_author:
        CV_author(X_selected, Y, 3, SVC,
                  {"kernel": PUK_kernel, "class_weight": "balanced", "C": 1})

    if interpret_tree:
        # Approximate the SVM with a decision tree and render it to disk.
        surrogate = reinterpret(X_selected, svm, X_selected)
        plt.figure(dpi=160, figsize=(24, 5))
        plot_tree(surrogate, max_depth=5, feature_names=X_feature_name[mask],
                  rounded=True, filled=True, fontsize=5)
        plt.savefig("./decision_tree_skb.jpg")

    return precision, mask
def use_RFE(n, print_log=True, interpret_tree=False):
    """Select n features by RFE (gradient-boosting ranker), then fit an SVM.

    in:
        n: int, # of features to be selected
        print_log: bool
    out:
        precision: float, precision_score of svm using choosed features
        selected: array(bool array), shape(n_features), True for features
            selected else False
    """
    rfe = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=n)
    X_trans = rfe.fit_transform(X, Y)
    selected = rfe.get_support()
    if print_log:
        print('features choosed by RFE:')
        print(X_feature_name[selected])
    # Evaluate an SVM on the selected features against the test split.
    model = SVC(kernel=PUK_kernel, class_weight="balanced", C=1)
    model.fit(X_trans, Y)
    pred = model.predict(rfe.transform(x))
    precision = precision_score(numout2boolout(y), numout2boolout(pred))
    cm = confusion_matrix(numout2boolout(y), numout2boolout(pred))
    if print_log:
        print('precision={0:.3f}'.format(precision))
        print("confusion matrix:")
        print(cm)
    if interpret_tree:
        # Approximate the SVM with a decision tree and render it.
        tree = reinterpret(X_trans, model, X_trans)
        plt.figure(dpi=160, figsize=(24, 5))
        plot_tree(tree, max_depth=5, feature_names=X_feature_name[selected],
                  rounded=True, filled=True, fontsize=5)
        # BUG FIX: savefig must come BEFORE show(); show() blocks and leaves
        # the figure empty/closed, so the original order saved a blank image.
        plt.savefig("./decision_tree_rfe.jpg")
        plt.show()
    return precision, selected
# NOTE(review): this chunk began with a dangling fragment that duplicates the
# tail of CV() and is syntactically invalid at module level (bare `return`);
# its enclosing `def` is not visible in this view, so it is preserved below
# as a comment rather than guessed at:
#     rec += recall_score(Y_test, pred, average='weighted')
#     prec += precision_score(Y_test, pred, average="weighted")
#     acc += accuracy_score(Y_test, pred)
#     return [rec / n_splits, prec / n_splits, acc / n_splits]

# Sweep C over 20 log-spaced values for each kernel, cross-validating both
# the 4-class and the binarised 2-class formulations, then save the flat
# metric list per kernel.
for i, kernel in enumerate([PUK_kernel, "rbf", "sigmoid"]):
    result = []
    for C in np.logspace(-10, 10, num=20):
        # Performance on the 4-class problem
        result += CV(X_masked, Y, SVC, {
            "kernel": kernel,
            "class_weight": "balanced",
            "C": C,
            "gamma": "scale",
        }, numlabel=True)
        # Performance on the 2-class problem
        result += CV(X_masked, numout2boolout(Y), SVC, {
            "kernel": kernel,
            "class_weight": "balanced",
            "C": C,
            "gamma": "scale",
        }, numlabel=False)
    # BUG FIX: np.save(file, arr) requires the array as its second argument;
    # the original call omitted it (TypeError) and `result` was never written.
    np.save("./out/SVC_{}.npy".format(kernelName[i]), result)
pandas 0.25.1
pytorch 1.1.0
scikit-learn 0.21.3
scipy 1.3.1
matplotlib 3.1.1
"""
import numpy as np
from utils import numout2boolout
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Load data: feature names, train/test feature matrices, binarised labels.
x_feature = np.load("./processedData/X/X_featureName.npy")
X = np.load("./processedData/X/X_train.npy")
x = np.load("./processedData/X/x_test.npy")
Y = numout2boolout(np.load("./processedData/Y/Y_train.npy"))
y = numout2boolout(np.load("./processedData/Y/y_test.npy"))

# Build a random forest and report its out-of-bag score on the train split.
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(X, Y)
print(rf0.oob_score_)
print("accuracy:%f" % rf0.oob_score_)
# NOTE(review): this re-fits the SAME estimator on the *test* split and
# prints that fit's OOB score — that is not the test accuracy of the model
# trained above. Confirm this is intentional.
rf0.fit(x, y)
print(rf0.oob_score_)
print("accuracy:%f" % rf0.oob_score_)

# Hyper-parameter tuning: grid over the number of trees.
# NOTE(review): the GridSearchCV call is truncated at this chunk boundary;
# its remaining arguments are outside this view.
param_test1 = {"n_estimators": range(1, 101, 5)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(),
                        param_grid=param_test1,
def _evaluate(model, criterion, data_loader, tag):
    """Predict over `data_loader` without gradients and print summed loss,
    accuracy, confusion matrix, and micro-averaged precision/recall."""
    with torch.no_grad():
        labels, pred = np.array([]), np.array([])
        loss_sum = 0.
        print(tag)
        for data, label in data_loader:
            #data, label = data.to(DEVICE), label.to(DEVICE)
            out = model(data)
            category = np.argmax(out, axis=1)
            loss = criterion(out, label)
            # .item() keeps loss_sum a plain float (no tensor accumulation).
            loss_sum += loss.item()
            labels = np.append(labels, label)
            pred = np.append(pred, category)
        acc = accuracy_score(labels, pred)
        cm = confusion_matrix(labels, pred)
        precision = precision_score(labels, pred, average='micro')
        recall = recall_score(labels, pred, average='micro')
        print("Test loss = {0:.5f}".format(loss_sum))
        print("Test accuracy = {0:.5f}".format(acc))
        np.set_printoptions(precision=5)
        print('Test cm = ')
        print(cm)
        print('Test precision = {0:.5f}'.format(precision))
        print('Test recall = {0:.5f}'.format(recall))


def main():
    """Train a small NN on SelectKBest+Normalizer features, evaluate on the
    train set (convergence check) and the test set, then save the weights."""
    # Hyper-parameters
    learning_rate = 1e-3
    weight_decay = 1e-3
    epoches = 300
    log_interval = 10
    # Unused while the .to(DEVICE) calls below stay commented out.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load labels, collapsed to 2 classes.
    Y = numout2boolout(np.load("./processedData/Y/Y_train.npy"))
    y = numout2boolout(np.load("./processedData/Y/y_test.npy"))

    # Inverse-frequency class weights for a balanced sampler
    # (sampler is currently unused — DataLoader below uses shuffle=True).
    weights_of_lable = np.zeros(2)
    print("# of labels in Y:")
    for i in range(2):
        num_label = len(Y[Y == i])
        print(i, num_label)
        weights_of_lable[i] = 1 / num_label
    weights = [weights_of_lable[int(i)] for i in Y]
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

    X = np.load("./processedData/X_train_masked.npy")
    x = np.load("./processedData/x_test_masked.npy")
    #X = np.load("./processedData/X/X_train_reduced.npy")
    #x = np.load("./processedData/X/x_test_reduced.npy")

    # Feature selection then per-sample normalisation.
    skb = SelectKBest(k=10)
    X_trans = skb.fit_transform(X, Y)
    x_trans = skb.transform(x)
    normalizer = Normalizer()
    X_trans = normalizer.fit_transform(X_trans)
    x_trans = normalizer.transform(x_trans)

    train_data = TensorDataset(torch.tensor(X_trans, dtype=torch.float),
                               torch.tensor(Y, dtype=torch.long))
    test_data = TensorDataset(torch.tensor(x_trans, dtype=torch.float),
                              torch.tensor(y, dtype=torch.long))
    train_data_loader = DataLoader(train_data, batch_size=16, shuffle=True)  # sampler=sampler)
    test_data_loader = DataLoader(test_data, batch_size=8, shuffle=True)

    # Build the model.
    # BUG FIX: the input layer must match the *selected* feature count
    # (X_trans has k=10 columns), not the raw column count of X; the original
    # used np.shape(X)[1], which mismatches whenever SelectKBest reduces X.
    model = simple_NN(np.shape(X_trans)[1], 8, 8)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    #criterion = nn.NLLLoss(reduction='sum')
    criterion = nn.CrossEntropyLoss()

    # Train
    model.train()
    print("Training...")
    for epoch in range(epoches):
        loss_sum = 0.
        for data, label in train_data_loader:
            #data, label = data.to(DEVICE), label.to(DEVICE)
            # Forward
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, label)
            # BUG FIX: accumulating the raw tensor retains the autograd graph
            # for every batch of the epoch; .item() detaches to a float.
            loss_sum += loss.item()
            # Backward
            loss.backward()
            optimizer.step()
        if (epoch + 1) % log_interval == 0:
            print("Epoch = {0}, loss = {1:.5f}".format(
                epoch + 1, loss.data.float()))

    # Evaluate: train set first (to check convergence), then held-out test set.
    model.eval()
    print("Predicting...")
    _evaluate(model, criterion, train_data_loader, 'result in train')
    _evaluate(model, criterion, test_data_loader, 'result in test')

    torch.save(model.state_dict(), "./NN_model/model1.pt")