Exemplo n.º 1
0
def CV(X, Y, Model, params, n_splits=3, numlabel=True, shuffle=True):
    """Group-aware cross-validation over reactant combinations.

    Folds are built over the global ``reactantCombination`` index groups so
    samples from the same combination never straddle the train/test split.

    in:
    X, Y: features / labels, indexable by the flat indices stored in
        reactantCombination (assumed array-like -- TODO confirm)
    Model: estimator class; params: dict of constructor kwargs for it
    n_splits, shuffle: forwarded to KFold
    numlabel: if True, collapse labels/predictions with numout2boolout
        before scoring
    out:
    [mean weighted recall, mean weighted precision, mean accuracy]
    """
    X_arr = np.asarray(X)
    kf = KFold(n_splits=n_splits, shuffle=shuffle)
    rec = prec = acc = 0.0
    for train_index_rc, test_index_rc in kf.split(reactantCombination):
        train_index = [
            i for rc in train_index_rc for i in reactantCombination[rc]
        ]
        test_index = [
            i for rc in test_index_rc for i in reactantCombination[rc]
        ]
        # BUG FIX: fit the scaler on the training folds only; fitting on the
        # full X (as before) leaked test-fold statistics into training.
        scaler = StandardScaler().fit(X_arr[train_index])
        X_train = scaler.transform(X_arr[train_index])
        X_test = scaler.transform(X_arr[test_index])
        Y_train, Y_test = Y[train_index], Y[test_index]
        model = Model(**params)
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        if numlabel:
            Y_test = numout2boolout(Y_test)
            pred = numout2boolout(pred)
        rec += recall_score(Y_test, pred, average='weighted')
        prec += precision_score(Y_test, pred, average="weighted")
        acc += accuracy_score(Y_test, pred)
    return [rec / n_splits, prec / n_splits, acc / n_splits]
Exemplo n.º 2
0
def use_SelectKBest(n, print_log=True, interpret_tree=False, cv_author=False):
    """Select the top-n features with SelectKBest and evaluate an SVC on them.

    in:
    n: int, number of features to select
    print_log: bool
    out:
    precision: float, precision_score of the SVC using the chosen features
    selected: array(bool array), shape(n_features), True for features
        selected else False
    """
    selector = SelectKBest(k=n)
    X_reduced = selector.fit_transform(X, Y)
    mask = selector.get_support()
    if print_log:
        print('features selected by SelectKBest:')
        print(X_feature_name[mask])

    clf = SVC(kernel=PUK_kernel, class_weight="balanced", C=1)
    clf.fit(X_reduced, Y)
    predictions = clf.predict(selector.transform(x))
    y_bool = numout2boolout(y)
    pred_bool = numout2boolout(predictions)
    prec = precision_score(y_bool, pred_bool)
    conf_mat = confusion_matrix(y_bool, pred_bool)
    if print_log:
        print('precision={0:.3f}'.format(prec))
        print("confusion matrix:")
        print(conf_mat)

    if cv_author:
        svc_params = {
            "kernel": PUK_kernel,
            "class_weight": "balanced",
            "C": 1
        }
        CV_author(X_reduced, Y, 3, SVC, svc_params)

    if interpret_tree:
        # Fit a surrogate decision tree to visualise the SVC's behaviour.
        surrogate = reinterpret(X_reduced, clf, X_reduced)
        plt.figure(dpi=160, figsize=(24, 5))
        plot_tree(surrogate,
                  max_depth=5,
                  feature_names=X_feature_name[mask],
                  rounded=True,
                  filled=True,
                  fontsize=5)
        plt.savefig("./decision_tree_skb.jpg")

    return prec, mask
Exemplo n.º 3
0
def use_RFE(n, print_log=True, interpret_tree=False):
    """Select n features with RFE (GradientBoosting ranking) and evaluate an SVC.

    in:
    n: int, # of features to be selected
    print_log: bool
    out:
    precision: float, precision_score of svm using chosen features
    selected: array(bool array), shape(n_features), True for features selected else False
    """
    rfe = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=n)
    X_trans = rfe.fit_transform(X, Y)
    selected = rfe.get_support()
    if print_log:
        print('features choosed by RFE:')
        print(X_feature_name[selected])

    model = SVC(kernel=PUK_kernel, class_weight="balanced", C=1)
    model.fit(X_trans, Y)
    pred = model.predict(rfe.transform(x))
    # Score on the binarised labels, matching use_SelectKBest.
    precision = precision_score(numout2boolout(y), numout2boolout(pred))
    cm = confusion_matrix(numout2boolout(y), numout2boolout(pred))
    if print_log:
        print('precision={0:.3f}'.format(precision))
        print("confusion matrix:")
        print(cm)

    if interpret_tree:
        # Fit a surrogate decision tree to visualise the SVC's behaviour.
        tree = reinterpret(X_trans, model, X_trans)
        plt.figure(dpi=160, figsize=(24, 5))
        plot_tree(tree,
                  max_depth=5,
                  feature_names=X_feature_name[selected],
                  rounded=True,
                  filled=True,
                  fontsize=5)
        # BUG FIX: save before show -- plt.show() clears the current figure
        # in interactive backends, so saving afterwards wrote a blank image.
        plt.savefig("./decision_tree_rfe.jpg")
        plt.show()

    return precision, selected
Exemplo n.º 4
0
        rec += recall_score(Y_test, pred, average='weighted')
        prec += precision_score(Y_test, pred, average="weighted")
        acc += accuracy_score(Y_test, pred)
    return [rec / n_splits, prec / n_splits, acc / n_splits]


# Grid-search SVC kernels and C values via the group-aware CV helper,
# saving one result array per kernel.
for i, kernel in enumerate([PUK_kernel, "rbf", "sigmoid"]):
    result = []
    for C in np.logspace(-10, 10, num=20):
        # Performance keeping the 4 numeric classes.
        result += CV(X_masked,
                     Y,
                     SVC, {
                         "kernel": kernel,
                         "class_weight": "balanced",
                         "C": C,
                         "gamma": "scale",
                     },
                     numlabel=True)
        # Performance collapsed to 2 classes.
        result += CV(X_masked,
                     numout2boolout(Y),
                     SVC, {
                         "kernel": kernel,
                         "class_weight": "balanced",
                         "C": C,
                         "gamma": "scale",
                     },
                     numlabel=False)
    # BUG FIX: np.save requires the array to write as its second argument;
    # the original call raised TypeError and never saved anything.
    np.save("./out/SVC_{}.npy".format(kernelName[i]), result)
Exemplo n.º 5
0
pandas 0.25.1
pytorch 1.1.0
scikit-learn 0.21.3
scipy 1.3.1
matplotlib 3.1.1
"""
import numpy as np
from utils import numout2boolout
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Load data: capital X/Y = training split, lowercase x/y = test split.
x_feature = np.load("./processedData/X/X_featureName.npy")
X = np.load("./processedData/X/X_train.npy")
x = np.load("./processedData/X/x_test.npy")
# numout2boolout collapses the numeric labels to binary labels.
Y = numout2boolout(np.load("./processedData/Y/Y_train.npy"))
y = numout2boolout(np.load("./processedData/Y/y_test.npy"))

# Fit a random forest and report its out-of-bag score on the training split.
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(X, Y)
print(rf0.oob_score_)
print("accuracy:%f" % rf0.oob_score_)
# NOTE(review): this re-FITS on the test split (discarding the model trained
# above) and reports the OOB score of that second forest -- it does not
# evaluate the first model on x/y. Confirm rf0.predict(x) was not intended.
rf0.fit(x, y)
print(rf0.oob_score_)
print("accuracy:%f" % rf0.oob_score_)

# Hyper-parameter grid for tuning the number of trees.
param_test1 = {"n_estimators": range(1, 101, 5)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(),
                        param_grid=param_test1,
Exemplo n.º 6
0
def main():
    """Train a small feed-forward net on the masked feature set.

    Loads train/test splits from ./processedData, selects the top-10 features,
    normalises them, trains simple_NN with Adam + cross-entropy, prints
    loss/accuracy/confusion-matrix/precision/recall for both splits, and saves
    the weights to ./NN_model/model1.pt.
    """
    learning_rate = 1e-3
    weight_decay = 1e-3
    epoches = 300
    log_interval = 10

    # Currently unused: all tensors below stay on CPU.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load labels (collapsed to binary by numout2boolout).
    Y = numout2boolout(np.load("./processedData/Y/Y_train.npy"))
    y = numout2boolout(np.load("./processedData/Y/y_test.npy"))

    # Inverse-frequency class weights for the (currently unused) sampler.
    weights_of_label = np.zeros(2)
    print("# of labels in Y:")
    for i in range(2):
        num_label = len(Y[Y == i])
        print(i, num_label)
        weights_of_label[i] = 1 / num_label

    weights = [weights_of_label[int(i)] for i in Y]
    # NOTE(review): sampler is built but not passed to the DataLoader below.
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))
    X = np.load("./processedData/X_train_masked.npy")
    x = np.load("./processedData/x_test_masked.npy")

    # Feature selection + normalisation, fitted on the training split only.
    skb = SelectKBest(k=10)
    X_trans = skb.fit_transform(X, Y)
    x_trans = skb.transform(x)

    normalizer = Normalizer()
    X_trans = normalizer.fit_transform(X_trans)
    x_trans = normalizer.transform(x_trans)

    train_data = TensorDataset(torch.tensor(X_trans, dtype=torch.float),
                               torch.tensor(Y, dtype=torch.long))
    test_data = TensorDataset(torch.tensor(x_trans, dtype=torch.float),
                              torch.tensor(y, dtype=torch.long))
    train_data_loader = DataLoader(train_data, batch_size=16, shuffle=True)
    test_data_loader = DataLoader(test_data, batch_size=8, shuffle=True)

    # BUG FIX: the network input width must match the data it is trained on
    # (X_trans, k=10 selected features), not the pre-selection width of X.
    model = simple_NN(np.shape(X_trans)[1], 8, 8)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Training loop.
    model.train()
    print("Training...")
    for epoch in range(epoches):
        loss_sum = 0.
        for data, label in train_data_loader:
            # Forward
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, label)
            # .item() detaches: accumulating the tensor itself (as before)
            # kept every batch's autograd graph alive for the whole epoch.
            loss_sum += loss.item()
            # Backward
            loss.backward()
            optimizer.step()

        if (epoch + 1) % log_interval == 0:
            # Report the accumulated epoch loss; the original printed only
            # the last batch's loss and never used loss_sum.
            print("Epoch = {0}, loss = {1:.5f}".format(epoch + 1, loss_sum))

    # Evaluation: metrics on the training split (convergence sanity check),
    # then on the held-out test split.
    model.eval()
    print("Predicting...")
    _evaluate(model, train_data_loader, criterion, 'result in train')
    _evaluate(model, test_data_loader, criterion, 'result in test')

    torch.save(model.state_dict(), "./NN_model/model1.pt")


def _evaluate(model, loader, criterion, header):
    """Run `model` over `loader` without gradients and print loss, accuracy,
    confusion matrix, micro precision and micro recall under `header`."""
    with torch.no_grad():
        labels, pred = np.array([]), np.array([])
        loss_sum = 0.
        print(header)
        for data, label in loader:
            out = model(data)
            category = np.argmax(out, axis=1)
            loss = criterion(out, label)
            loss_sum += loss.item()
            labels = np.append(labels, label)
            pred = np.append(pred, category)

        acc = accuracy_score(labels, pred)
        cm = confusion_matrix(labels, pred)
        precision = precision_score(labels, pred, average='micro')
        recall = recall_score(labels, pred, average='micro')

        print("Test loss = {0:.5f}".format(loss_sum))
        print("Test accuracy = {0:.5f}".format(acc))
        np.set_printoptions(precision=5)
        print('Test cm = ')
        print(cm)
        print('Test precision = {0:.5f}'.format(precision))
        print('Test recall = {0:.5f}'.format(recall))