Пример #1
0
def run_classification(data, labels, test_idx, trees, c):
    """Train a random forest per fold and collect its test-set scores.

    For each ``(fold_data, test_idx_fold)`` pair, every row index in
    ``range(len(data[0]))`` that is not listed in ``test_idx_fold`` becomes a
    training row. Features are scaled with a ``MaxAbsScaler`` fitted on the
    training rows only, a ``RandomForestClassifier`` is fitted, and the score
    of the second class (``predict_proba(...)[:, 1]``) is recorded for the
    test rows.

    Args:
        data: Sequence of per-fold feature matrices. The row count of the
            first fold is used for every fold.
        labels: Class labels, indexed with the same row indices as the
            feature matrices (assumed to be one label per row -- TODO confirm
            against the caller).
        test_idx: Sequence of per-fold collections of test row indices.
        trees: Number of trees (``n_estimators``).
        c: Split criterion passed to the forest (``criterion``).

    Returns:
        A list with one array of test scores per fold.
    """
    all_scores = []
    n_samples = len(data[0])
    # The original code used the row indices themselves as targets and never
    # read ``labels``; index the real labels instead.
    labels = np.asarray(labels)

    for fold_data, test_idx_fold in zip(data, test_idx):
        # Set membership keeps the train/test split linear in the row count.
        held_out = set(test_idx_fold)
        train_idx_fold = np.array(
            [idx for idx in range(n_samples) if idx not in held_out],
            dtype=int)
        test_idx_fold = np.array(test_idx_fold)
        fold_data = np.array(fold_data)

        X_train, X_test = fold_data[train_idx_fold, ], fold_data[
            test_idx_fold, ]
        y_train, y_test = labels[train_idx_fold], labels[test_idx_fold]

        # Fit the scaler on training rows only so no test statistics leak in.
        max_abs_scaler = MaxAbsScaler()
        X_train_scaled = max_abs_scaler.fit_transform(X_train)
        X_test_scaled = max_abs_scaler.transform(X_test)

        rf = RandomForestClassifier(n_estimators=trees,
                                    n_jobs=6,
                                    criterion=c,
                                    class_weight="balanced",
                                    random_state=1357)
        rf.fit(X_train_scaled, y_train)

        # Prefer a decision function when the estimator offers one; random
        # forests do not, so the second-class probability is used.
        if hasattr(rf, "decision_function"):
            scores_testing = rf.decision_function(X_test_scaled)
        else:
            scores_testing = rf.predict_proba(X_test_scaled)[:, 1]

        all_scores.append(scores_testing)
        print(y_test)

    return all_scores
Пример #2
0
def performance(x_train,
                y_train,
                x_test,
                y_test,
                algorithm,
                n_estimators=None,
                max_features=None,
                kernel=None,
                C=None,
                gamma=None,
                degree=None,
                coef0=None):
    """Fit the requested model and return its ROC AUC on the test set.

    Args:
        x_train, y_train: Training features and labels.
        x_test, y_test: Test features and labels.
        algorithm: ``'SVM'`` or ``'random-forest'``.
        n_estimators, max_features: Random-forest settings; converted with
            ``int()`` before use.
        kernel, C, gamma, degree, coef0: Forwarded to ``train_svm``.

    Returns:
        The value of ``optunity.metrics.roc_auc`` with ``positive=True``.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    if algorithm == 'SVM':
        model = train_svm(x_train, y_train, kernel, C, gamma, degree, coef0)
        # NOTE(review): train_svm may already return a fitted model; the
        # explicit fit is kept from the original -- confirm it is needed.
        model.fit(x_train, y_train)
        predictions = model.decision_function(x_test)
    elif algorithm == 'random-forest':
        model = RandomForestClassifier(n_estimators=int(n_estimators),
                                       max_features=int(max_features))
        model.fit(x_train, y_train)
        predictions = model.predict_proba(x_test)[:, 1]
    else:
        # Previously this only printed and then failed later with an
        # UnboundLocalError on ``model``.
        raise ValueError("Unknown algorithm: %s" % algorithm)

    return optunity.metrics.roc_auc(y_test, predictions, positive=True)
Пример #3
0
 def train(self):
     """
     Train the classifier selected by ``self.flag`` and save it.

     Supported flags are 'SVM', 'RF', 'NB', 'DT', 'LR' and 'KNN'; any other
     value does nothing. The model is fitted on ``self.x_train`` /
     ``self.y_train``. For 'SVM' and 'LR' the decision-function values on
     ``self.x_test`` are stored in ``self.score``. The fitted model is dumped
     to ``model_save_path`` with joblib.
     :return: None
     """
     flag = self.flag
     if flag == 'SVM':
         model = OneVsRestClassifier(
             SVC(kernel=kernel_func,
                 probability=True,
                 C=1.0,
                 random_state=0,
                 gamma=0.2))
     elif flag == 'RF':
         model = RandomForestClassifier()
     elif flag == 'NB':
         model = GaussianNB()
     elif flag == 'DT':
         model = DecisionTreeClassifier()
     elif flag == 'LR':
         model = LogisticRegression()
     elif flag == 'KNN':
         model = KNeighborsClassifier(n_neighbors=3)
     else:
         # Unknown flag: nothing is trained or saved.
         return

     model.fit(self.x_train, self.y_train)

     # Only these two flags record decision-function scores.
     if flag == 'SVM' or flag == 'LR':
         self.score = model.decision_function(self.x_test)

     print('模型已保存到%s' % (model_save_path))
     joblib.dump(model, model_save_path)
Пример #4
0
def random_forest_classifier(data):
    """
    Fit a random forest, print its hold-out accuracy, then plot calibration
    curves for four classifiers trained on the same split.

    :param data: data frame with the feature columns 'entropy', 'lzw', 'a',
        'c', 'g', 't' and the label column 'y'
    :return: None
    """
    feature_columns = ['entropy', 'lzw', 'a', 'c', 'g', 't']
    features = data[feature_columns]
    target = data['y']

    # 70% of the rows are used for training, 30% for testing.
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.3)

    # Baseline random forest and its accuracy on the held-out rows.
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    # Calibration comparison adapted from the scikit-learn example
    # "plot_compare_calibration".
    candidates = [
        (LogisticRegression(solver='lbfgs'), 'Logistic'),
        (GaussianNB(), 'Naive Bayes'),
        (LinearSVC(C=1.0), 'Support Vector Classification'),
        (RandomForestClassifier(n_estimators=100), 'Random Forest'),
    ]

    plt.figure(figsize=(10, 10))
    curve_ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    hist_ax = plt.subplot2grid((3, 1), (2, 0))

    curve_ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
    for model, title in candidates:
        model.fit(X_train, y_train)
        if not hasattr(model, "predict_proba"):
            # No probabilities available: min-max scale the decision values.
            raw = model.decision_function(X_test)
            prob_pos = (raw - raw.min()) / (raw.max() - raw.min())
        else:
            prob_pos = model.predict_proba(X_test)[:, 1]

        fraction_of_positives, mean_predicted_value = calibration_curve(
            y_test, prob_pos, n_bins=10)

        curve_ax.plot(mean_predicted_value,
                      fraction_of_positives,
                      "s-",
                      label="%s" % (title,))
        hist_ax.hist(prob_pos,
                     range=(0, 1),
                     bins=10,
                     label=title,
                     histtype="step",
                     lw=2)

    curve_ax.set_ylabel("Fraction of positives", fontsize=14)
    curve_ax.set_ylim([-0.05, 1.05])
    curve_ax.legend(loc="lower right")
    curve_ax.set_title('Calibration plots  (reliability curve)', fontsize=14)

    hist_ax.set_xlabel("Mean predicted value", fontsize=14)
    hist_ax.set_ylabel("Count", fontsize=14)
    hist_ax.legend(loc="upper center", ncol=2)

    plt.tight_layout()
    plt.show()
Пример #5
0
# Train a depth-limited random forest, then write "actual,predicted" pairs for
# the test set to a results file while counting matches and mismatches.
# Alternative configurations that were tried:
#rf = RandomForestClassifier(n_estimators=100, max_features='auto', n_jobs=4)
rf = RandomForestClassifier(n_estimators=100, max_features=15, n_jobs=4, max_depth=8)
#rf = RandomForestClassifier(n_estimators=100, max_features='auto', n_jobs=4, max_depth=5)

# training (single-argument print calls behave the same on Python 2 and 3)
st = time.time()
print("training started")
rf.fit(x_train, y_train)
print("training ended")
et = time.time()
tt = et - st
print("Training Time = " + str(tt) + "\n")

# predictions
pred = rf.predict(x_test)
# RandomForestClassifier has no decision_function; its continuous output is
# the per-class probability matrix of shape (n_samples, n_classes).
y_score = rf.predict_proba(x_test)

# validation
total = y_test.size
good = 0
bad = 0
# The context manager guarantees the results file is flushed and closed.
with open('../results/rf_combi_yes.txt', 'w') as out:
    for i in range(total):
        a = y_test[i]
        p = pred[i]
        out.write(str(a) + ',' + str(p) + '\n')
        # Compare as strings so differing label types still match.
        if str(a) == str(p):
            good += 1
        else:
            bad += 1
Пример #6
0
            c=y_train,
            s=30,
            cmap=plt.cm.Paired)
# Label the axes with the names held in x1_var / x2_var.
plt.xlabel(x1_var)
plt.ylabel(x2_var)
# plot the decision function
# Reuse the limits of the current axes so the grid covers what is already drawn.
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# create grid to evaluate model
# 30 evenly spaced points per axis -> a 30 x 30 evaluation grid.
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
# One (x, y) row per grid point, in the same order as XX.ravel().
xy = np.vstack([XX.ravel(), YY.ravel()]).T
# Evaluate the classifier on every grid point and restore the grid shape.
Z = clf.decision_function(xy).reshape(XX.shape)

# plot decision boundary and margins
# Level 0 (solid red) is the decision boundary; levels -1 and +1 (dashed
# green) are drawn on either side of it.
ax.contour(XX,
           YY,
           Z,
           colors=['g', 'r', 'g'],
           levels=[-1, 0, 1],
           alpha=0.5,
           linestyles=['--', '-', '--'])
plt.show()

### Explore different kernels and C values
from sklearn.model_selection import StratifiedKFold
import time
from sklearn.svm import SVC
Пример #7
0


# Predict on the test set and print the mean of the predictions.
test_results = clf.predict(test)
print(np.mean(test_results))

# Collect the predictions in a single-column frame for export.
data_to_submit = pd.DataFrame({
    #'id':test_names,
    'prediction':test_results
})

# Write the predictions without the row index.
data_to_submit.to_csv('csv_to_submit_unbalanced.csv', index = False)


# Scale the decision values by the norm of the dual coefficients.
# NOTE(review): presumably meant as a distance to the separating hyperplane;
# the norm of dual_coef_ is not in general the norm of the weight vector --
# confirm this is the intended normalisation.
w_norm = np.linalg.norm(clf.dual_coef_)
dist = clf.decision_function(X_train) / w_norm

closest_50 = []

count = 0

for i in range(len(dist)):

    current = X_train[i,:]

    if count > len(ictal_training_dwt_four_energy)-1:

        continue 

    if y[i] == 0:
# Score a 6% sample of the credit-card data with an Isolation Forest and plot
# the score distribution of the rows labelled 0.
cc = pd.read_csv("creditcard.csv")
# Fixed random_state keeps the sample reproducible.
cc = cc.sample(frac=0.06, random_state=1)

# Feature frame without the label column. ``axis`` is passed by keyword
# because pandas 2.x no longer accepts it positionally.
cc_train = cc.drop('Class', axis=1)

from sklearn.ensemble import IsolationForest
clf = IsolationForest(n_estimators=1000, max_samples=200)
# Train the model with the data. IsolationForest is unsupervised: the ``y``
# argument is accepted for API consistency and ignored by ``fit``.
y_value = cc['Class']
clf.fit(cc_train, y_value)

# The anomaly score of each observation is calculated and stored in 'scores_pred'.
scores_pred = clf.decision_function(cc_train)

# scores_pred is added to the cc dataframe.
cc['scores'] = scores_pred
# The column name 'Class' caused a naming conflict, so it is renamed to 'Category'.
cc = cc.rename(columns={'Class': 'Category'})

# Keep only the rows labelled 0 (normal observations).
avg_count_0 = cc.loc[cc.Category == 0]

# Histogram of the scores of the normal observations, 50 bins.
normal1 = plt.hist(avg_count_0.scores, 50,)
plt.xlabel('Score distribution of 0')
plt.ylabel('Frequency of 0')
plt.title("Distribution of isoforest score for normal observation")
Пример #9
0
def _read_csv_lines(path):
    """Load a text file as a NumPy array holding one raw string per line."""
    return np.genfromtxt(path, delimiter='\n', dtype=None, encoding=None)


def _encode_and_pad(lines):
    """Ordinal-encode field 3 of every non-header line and zero-pad the result.

    Returns an array of shape ``(len(lines) - 1, longest_encoding, 5)``.
    """
    encoded = [
        ordinal_encoder(string_to_array(row.split(",")[3]))
        for row in lines[1:]
    ]
    longest = max((len(seq) for seq in encoded), default=0)
    padded = np.zeros((lines.shape[0] - 1, longest, 5))
    for row_idx, seq in enumerate(encoded):
        padded[row_idx][:len(seq)] = seq
    return padded


def _prepare_level(level_dir):
    """Encode train/test/val CSVs of one level and store them in ordinal.h5.

    Returns the raw training lines and the padded train and test arrays.
    """
    train_lines = _read_csv_lines(level_dir + '/train.csv')
    test_lines = _read_csv_lines(level_dir + '/test.csv')
    val_lines = _read_csv_lines(level_dir + '/val.csv')

    train_x = _encode_and_pad(train_lines)
    test_x = _encode_and_pad(test_lines)
    val_x = _encode_and_pad(val_lines)

    # The context manager closes the file even if a dataset write fails.
    with h5py.File(level_dir + '/ordinal.h5', 'w') as hf:
        hf.create_dataset('dataset_1', data=train_x)
        hf.create_dataset('dataset_2', data=test_x)
        hf.create_dataset('dataset_3', data=val_x)

    return train_lines, train_x, test_x


def _phylum_label(row):
    """Map a CSV line to 0 / 1 / 2 from the first character of its first field."""
    initial = row.split(",")[0][0]
    if initial == "A":
        return 0
    if initial == "F":
        return 1
    return 2


def _fit_save_and_log(level, train_x, train_y, test_x, dirpath_output, logger):
    """Fit SVM and random forest, save test outputs, log training metrics."""
    # Flatten (samples, positions, 5) into (samples, positions * 5).
    flat_train = train_x.reshape(train_x.shape[0],
                                 train_x.shape[1] * train_x.shape[2])
    flat_test = test_x.reshape(test_x.shape[0],
                               test_x.shape[1] * test_x.shape[2])

    svm = SVC(kernel='rbf')
    forest = RandomForestClassifier()
    svm.fit(flat_train, train_y)
    forest.fit(flat_train, train_y)

    svm_test_preds = svm.predict(flat_test)
    forest_test_preds = forest.predict(flat_test)
    np.save(dirpath_output + '/SVM_' + level + '_predictions', svm_test_preds)
    np.save(dirpath_output + '/RF_' + level + '_predictions',
            forest_test_preds)

    # The SVM score is the largest per-class decision value; taking the
    # maximum over axis 1 needs a 2-D decision function, as in the original.
    svm_test_scores = np.amax(svm.decision_function(flat_test), axis=1)
    np.save(dirpath_output + '/SVM_' + level + '_scores', svm_test_scores)
    # As in the original, the forest "scores" are its predicted labels.
    np.save(dirpath_output + '/RF_' + level + '_scores', forest_test_preds)

    for model_name, model in (('SVM', svm), ('Random Forest', forest)):
        train_preds = model.predict(flat_train)
        accuracy = float(np.sum(train_preds == train_y)) / train_preds.shape[0]
        precision, recall, f1, _ = precision_recall_fscore_support(
            train_y, train_preds, average='weighted')
        logger.info(
            'Train Accuracy, precision, recall and F1 Score for {} model for {} level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
            .format(model_name, level, accuracy, precision, recall, f1))


def train_basic(dirpath_vector, dirpath_output):
    """Train SVM and random-forest baselines at phylum and class level.

    For each level the train/test/val CSVs under ``dirpath_vector/<level>/``
    are ordinal-encoded, zero-padded and written to ``ordinal.h5`` in the same
    directory. Both models are then fitted on the training split; their test
    predictions and scores are saved as ``.npy`` files under
    ``dirpath_output`` and their training accuracy, precision, recall and F1
    are logged.

    Fixes relative to the earlier version: class-level prediction files now
    hold the output of the model they are named after, class-level log lines
    say "class" instead of "phylum", and an unused class-level ROC call that
    was given a 2-D score array has been removed along with other unused
    computations.

    Args:
        dirpath_vector: Directory containing the ``phylum`` and ``class``
            sub-directories with ``train.csv``, ``test.csv`` and ``val.csv``.
        dirpath_output: Directory that receives the saved ``.npy`` files.

    Returns:
        None.
    """
    logger = utils.get_logger()

    # Encode and persist both levels before any model is trained.
    phylum_lines, phylum_train, phylum_test = _prepare_level(
        dirpath_vector + '/phylum')
    class_lines, class_train, class_test = _prepare_level(
        dirpath_vector + '/class')

    phylum_labels = np.array([_phylum_label(row) for row in phylum_lines[1:]])
    _fit_save_and_log('phylum', phylum_train, phylum_labels, phylum_test,
                      dirpath_output, logger)

    # Class labels are the integer in field 2 of each line.
    class_labels = np.array(
        [int(row.split(",")[2]) for row in class_lines[1:]])
    _fit_save_and_log('class', class_train, class_labels, class_test,
                      dirpath_output, logger)
Пример #10
0
def process_target_t1(label_target):
    """Fit a classifier for one target label and record its validation AUC.

    Relies on module-level configuration and data (``submit``,
    ``features_selection``, ``classifier``, ``train_labels``, ``X_t1``,
    ``X_val_t1``, ``X_test_t1``, ``best_kernels``, ``lock`` and others).
    Test-set predictions are written to ``Y_test_tot[label_target]`` and the
    score to ``scores_t1[label_target]`` while ``lock`` is held.

    Args:
        label_target: Name of the label column to model.

    Returns:
        The ROC AUC of the predictions on the validation rows.

    Raises:
        ValueError: If ``classifier`` is not 'linear', 'kernel' or 'RF'.
    """
    # NOTE(review): the non-submit slice ends at row train_size - 2; confirm
    # that X_t1 is cut the same way.
    if submit:
        Y_t1 = train_labels[label_target].iloc[:]
    else:
        Y_t1 = train_labels[label_target].iloc[0:train_size - 1]
    Y_val_t1 = train_labels[label_target].iloc[train_size:]

    if features_selection:
        usefulness_column = stored_usefulness_matrix_t1[
            label_target].sort_values(ascending=False)
        useful_features_mask = np.array(usefulness_column) >= threshold
        useful_features = [
            feature for feature, keep in zip(usefulness_column.index,
                                             useful_features_mask) if keep
        ]
        # Every selected test brings its 'dummy_' companion column along.
        useful_features_augmented = []
        for feature in useful_features:
            if feature in tests:
                useful_features_augmented.extend([feature, 'dummy_' + feature])
        extra_pool = vital_signs + diff_features
        useful_features_augmented += [
            feature for feature in useful_features if feature in extra_pool
        ]
        # Compute the selection once so all three frames share the same
        # columns in the same order.
        selected_columns = list(
            set(useful_features_augmented) & set(X_t1.columns))
        X_t1_useful = X_t1[selected_columns]
        X_val_t1_useful = X_val_t1[selected_columns]
        X_test_t1_useful = X_test_t1[selected_columns]
    else:
        X_t1_useful = X_t1
        X_val_t1_useful = X_val_t1
        X_test_t1_useful = X_test_t1

    # Choose the model; a 'poly1' kernel is handled by the linear SVM.
    if classifier == 'linear' or (classifier == 'kernel'
                                  and best_kernels.at[label_target,
                                                      'kernel'] == 'poly1'):
        clf = svm.LinearSVC(C=1e-3,
                            tol=1e-2,
                            class_weight='balanced',
                            verbose=0)
    elif classifier == 'kernel':
        kernel_dict = {
            'poly2': ('poly', 2),
            'poly3': ('poly', 3),
            'rbf': ('rbf', 0)
        }
        kernel, degree = kernel_dict[best_kernels.at[label_target, 'kernel']]
        C = best_kernels.at[label_target, 'C']
        clf = svm.SVC(C=C,
                      kernel=kernel,
                      degree=degree,
                      tol=1e-4,
                      class_weight='balanced',
                      verbose=0)
    elif classifier == 'RF':
        clf = RandomForestClassifier(n_estimators=2500,
                                     class_weight="balanced_subsample")
    else:
        raise ValueError("choose between 'linear', 'kernel' and 'RF' ")

    clf.fit(X_t1_useful, Y_t1)

    if classifier == 'RF':
        # One minus the first probability column.
        Y_val_pred = (1 - clf.predict_proba(X_val_t1_useful))[:, 0]
        Y_test_pred = (1 - clf.predict_proba(X_test_t1_useful))[:, 0]
    else:
        # SVM variants: squash the decision values through a sigmoid.
        Y_temp = np.array([clf.decision_function(X_val_t1_useful)])
        Y_val_pred = (1 / (1 + np.exp(-Y_temp))).flatten()
        Y_temp = np.array([clf.decision_function(X_test_t1_useful)])
        Y_test_pred = (1 / (1 + np.exp(-Y_temp))).flatten()

    # Release the lock even if scoring or assignment raises, so other
    # workers are never left blocked.
    lock.acquire()
    try:
        Y_test_tot.loc[:, label_target] = Y_test_pred
        score = np.mean([skmetrics.roc_auc_score(Y_val_t1, Y_val_pred)])
        scores_t1[label_target] = score
    finally:
        lock.release()
    return score
Пример #11
0
def test_basic(dirpath_vector, dirpath_output, verbose=True):
    logger = utils.get_logger()
    x_train = np.genfromtxt(dirpath_vector + '/phylum/train.csv',
                            delimiter='\n',
                            dtype=None,
                            encoding=None)
    x_test = np.genfromtxt(dirpath_vector + '/phylum/test.csv',
                           delimiter='\n',
                           dtype=None,
                           encoding=None)
    x_val = np.genfromtxt(dirpath_vector + '/phylum/val.csv',
                          delimiter='\n',
                          dtype=None,
                          encoding=None)
    arr = []
    arr1 = []
    arr2 = []

    for item in x_train[1:]:
        arr.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_test[1:]:
        arr1.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_val[1:]:
        arr2.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    maxi = 0
    for item in arr:
        if len(item) > maxi:
            maxi = len(item)

    final1 = np.zeros((x_train.shape[0] - 1, maxi))

    count = 0
    for item in arr:
        final1[count][:len(item)] = item
        count += 1

    maxi1 = 0
    for item in arr1:
        if len(item) > maxi1:
            maxi1 = len(item)

    final2 = np.zeros((x_test.shape[0] - 1, maxi1))

    count = 0
    for item in arr1:
        final2[count][:len(item)] = item
        count += 1

    maxi2 = 0
    for item in arr2:
        if len(item) > maxi2:
            maxi2 = len(item)

    final3 = np.zeros((x_val.shape[0] - 1, maxi2))

    count = 0
    for item in arr2:
        final3[count][:len(item)] = item
        count += 1

    hf = h5py.File(dirpath_vector + '/phylum/ordinal.h5', 'w')

    hf.create_dataset('dataset_1', data=final1)
    hf.create_dataset('dataset_2', data=final2)
    hf.create_dataset('dataset_3', data=final3)

    hf.close()

    x_train = np.genfromtxt(dirpath_vector + '/class/train.csv',
                            delimiter='\n',
                            dtype=None,
                            encoding=None)
    x_test = np.genfromtxt(dirpath_vector + '/class/test.csv',
                           delimiter='\n',
                           dtype=None,
                           encoding=None)
    x_val = np.genfromtxt(dirpath_vector + '/class/val.csv',
                          delimiter='\n',
                          dtype=None,
                          encoding=None)
    arr = []
    arr1 = []
    arr2 = []

    for item in x_train[1:]:
        arr.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_test[1:]:
        arr1.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    for item in x_val[1:]:
        arr2.append(ordinal_encoder(string_to_array(item.split(",")[3])))

    maxi = 0
    for item in arr:
        if len(item) > maxi:
            maxi = len(item)

    final1 = np.zeros((x_train.shape[0] - 1, maxi))

    count = 0
    for item in arr:
        final1[count][:len(item)] = item
        count += 1

    maxi1 = 0
    for item in arr1:
        if len(item) > maxi1:
            maxi1 = len(item)

    final2 = np.zeros((x_test.shape[0] - 1, maxi1))

    count = 0
    for item in arr1:
        final2[count][:len(item)] = item
        count += 1

    maxi2 = 0
    for item in arr2:
        if len(item) > maxi2:
            maxi2 = len(item)

    final3 = np.zeros((x_val.shape[0] - 1, maxi2))

    count = 0
    for item in arr2:
        final3[count][:len(item)] = item
        count += 1

    hf = h5py.File(dirpath_vector + '/class/ordinal.h5', 'w')

    hf.create_dataset('dataset_1', data=final1)
    hf.create_dataset('dataset_2', data=final2)
    hf.create_dataset('dataset_3', data=final3)

    hf.close()

    hf = h5py.File(dirpath_vector + '/phylum/ordinal.h5', 'r')
    n1 = hf.get('dataset_1')
    n2 = hf.get('dataset_2')
    n3 = hf.get('dataset_3')
    X = np.array(n1)
    Y = np.array(n2)
    V = np.array(n3)
    hf.close()
    lab = np.genfromtxt(dirpath_vector + '/phylum/train.csv',
                        delimiter='\n',
                        dtype=None,
                        encoding=None)
    lab1 = np.genfromtxt(dirpath_vector + '/phylum/test.csv',
                         delimiter='\n',
                         dtype=None,
                         encoding=None)
    lab2 = np.genfromtxt(dirpath_vector + '/phylum/val.csv',
                         delimiter='\n',
                         dtype=None,
                         encoding=None)

    labels = []
    i = 0
    for item in lab[1:]:
        if item.split(",")[0][0] == "A":
            labels.append(0)
        elif item.split(",")[0][0] == "F":
            labels.append(1)
        else:
            labels.append(2)
        i += 1

    labels1 = []
    i = 0
    for item in lab1[1:]:
        if item.split(",")[0][0] == "A":
            labels1.append(0)
        elif item.split(",")[0][0] == "F":
            labels1.append(1)
        else:
            labels1.append(2)
        i += 1

    labels2 = []
    i = 0
    for item in lab2[1:]:
        if item.split(",")[0][0] == "A":
            labels2.append(0)
        elif item.split(",")[0][0] == "F":
            labels2.append(1)
        else:
            labels2.append(2)
        i += 1

    label = np.array(labels)
    label1 = np.array(labels1)
    label2 = np.array(labels2)

    clf2 = SVC(kernel='rbf')
    clf = RandomForestClassifier()

    clf2.fit(X, label)
    clf.fit(X, label)

    preds2 = clf2.predict(Y)
    preds = clf.predict(Y)

    scores = clf2.decision_function(Y)
    scores2 = clf.predict(Y)

    score = np.amax(scores, axis=1)

    fpr, tpr, thresholds = roc_curve(label1, score, pos_label=2)
    fpr2, tpr2, thresholds2 = roc_curve(label1, scores2, pos_label=2)

    roc_auc = auc(fpr, tpr)
    roc_auc2 = auc(fpr2, tpr2)
    plt.plot(fpr2, tpr2, lw=1, label='(AUC = %0.2f)' % (roc_auc2))
    plt.plot(fpr, tpr, lw=1, label='(AUC = %0.2f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title("ROC curve SVM vs RF - Phylum Level")
    plt.legend(("SVM", "RandomForest"))
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    plt.savefig(dirpath_output + "/" + "ROC_Phylum")

    match2 = 0
    for i in range(preds2.shape[0]):
        if preds2[i] == label1[i]:
            match2 += 1
    accuracy2 = float(match2) / preds2.shape[0]
    p, r, f1, s = precision_recall_fscore_support(label1,
                                                  preds2,
                                                  average='weighted')

    match = 0
    for i in range(preds.shape[0]):
        if preds[i] == label1[i]:
            match += 1
    accuracy = float(match) / preds.shape[0]
    p2, r2, f12, s = precision_recall_fscore_support(label1,
                                                     preds,
                                                     average='weighted')

    C = confusion_matrix(label1, preds2)

    logger.info(
        'Test Accuracy, precision, recall and F1 Score for SVM model for phylum level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
        .format(accuracy2, p, r, f1))
    logger.info(
        'Test Accuracy, precision, recall and F1 Score for Random Forest model for phylum level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
        .format(accuracy, p2, r2, f12))

    hf = h5py.File(dirpath_vector + '/class/ordinal.h5', 'r')
    n1 = hf.get('dataset_1')
    n2 = hf.get('dataset_2')
    n3 = hf.get('dataset_3')
    X = np.array(n1)
    Y = np.array(n2)
    V = np.array(n3)
    hf.close()

    lab = np.genfromtxt(dirpath_vector + '/class/train.csv',
                        delimiter='\n',
                        dtype=None,
                        encoding=None)
    lab1 = np.genfromtxt(dirpath_vector + '/class/test.csv',
                         delimiter='\n',
                         dtype=None,
                         encoding=None)
    lab2 = np.genfromtxt(dirpath_vector + '/class/val.csv',
                         delimiter='\n',
                         dtype=None,
                         encoding=None)

    labels = []
    i = 0
    for item in lab[1:]:
        labels.append(int(item.split(",")[2]))
        i += 1

    labels1 = []
    i = 0
    for item in lab1[1:]:
        labels1.append(int(item.split(",")[2]))
        i += 1

    labels2 = []
    i = 0
    for item in lab2[1:]:
        labels2.append(int(item.split(",")[2]))
        i += 1

    label = np.array(labels)
    label1 = np.array(labels1)
    label2 = np.array(labels2)

    clf2 = RandomForestClassifier()
    clf = SVC(kernel='rbf')

    clf2.fit(X, label)
    clf.fit(X, label)
    preds2 = clf2.predict(Y)
    preds = clf.predict(Y)
    scores = clf2.predict(Y)
    scores1 = clf.decision_function(Y)

    score = np.amax(scores1, axis=1)

    fpr, tpr, thresholds = roc_curve(label1, scores, pos_label=2)
    fpr2, tpr2, thresholds2 = roc_curve(label1, score, pos_label=2)

    roc_auc = auc(fpr, tpr)
    roc_auc2 = auc(fpr2, tpr2)
    plt.figure()
    plt.plot(fpr2, tpr2, lw=1, label='(AUC = %0.2f)' % (roc_auc2))
    plt.plot(fpr, tpr, lw=1, label='(AUC = %0.2f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title("ROC curve SVM vs RF - Class Level")
    plt.legend(("SVM", "RandomForest"))
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    if not os.path.exists(dirpath_output):
        os.makedirs(dirpath_output)
    plt.savefig(dirpath_output + "/" + "ROC_Class")

    match2 = 0
    for i in range(preds2.shape[0]):
        if preds2[i] == label1[i]:
            match2 += 1
    accuracy2 = float(match2) / preds2.shape[0]
    p, r, f1, s = precision_recall_fscore_support(label1,
                                                  preds2,
                                                  average='weighted')
    C = confusion_matrix(label1, preds2)

    match = 0
    for i in range(preds.shape[0]):
        if preds[i] == label1[i]:
            match += 1
    accuracy = float(match) / preds.shape[0]
    p2, r2, f12, s = precision_recall_fscore_support(label1,
                                                     preds,
                                                     average='weighted')

    logger.info(
        'Test Accuracy, precision, recall and F1 Score for SVM model for class level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
        .format(accuracy, p2, r2, f12))
    logger.info(
        'Test Accuracy, precision, recall and F1 Score for Random Forest model for class level is {:.3f}, {:.3f}, {:.3f}, {:.3f}'
        .format(accuracy2, p, r, f1))
Пример #12
0
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0],
                   X_test[:, 1],
                   c=y_test,
                   cmap=cm_bright,
                   alpha=0.6)
Пример #13
0
# Fit a linear-kernel SVC (C=1000) on the sample data; fit() returns the
# estimator itself, so the fitted model can be bound in one statement.
clf = svm.SVC(C=1000, kernel='linear').fit(X, y)

# Scatter the samples, coloured by their label.
plt.scatter(X[:, 0], X[:, 1], s=30, c=y, cmap=plt.cm.Paired)

# Capture the axis limits produced by the scatter plot; the evaluation grid
# below is built to span exactly this visible area.
ax = plt.gca()
xlim, ylim = ax.get_xlim(), ax.get_ylim()

# 30 x 30 grid of points at which the decision function is evaluated.
xx = np.linspace(*xlim, num=30)
yy = np.linspace(*ylim, num=30)
YY, XX = np.meshgrid(yy, xx)
xy = np.column_stack((XX.ravel(), YY.ravel()))
Z = clf.decision_function(xy).reshape(XX.shape)

# Draw the decision boundary (level 0, solid) and the two margins
# (levels -1 and +1, dashed).
ax.contour(XX, YY, Z,
           levels=[-1, 0, 1],
           linestyles=['--', '-', '--'],
           colors='k',
           alpha=0.5)
# plot support vectors
ax.scatter(clf.support_vectors_[:, 0],
           clf.support_vectors_[:, 1],
           s=100,
           linewidth=1,
           facecolors='none',
Пример #14
0
def perform_experiment(train_fs,
                       test_fs,
                       avstats_in,
                       binarize,
                       classifier='RF',
                       subsample=False):
    """Train and evaluate one classifier per (train, test) file pair.

    Args:
        train_fs: Training files in svmlight format, one per period.
        test_fs: Test files in svmlight format, paired with ``train_fs``
            by position.
        avstats_in: Mapping from a file ID (as returned by
            ``load_SHA256_sums``) to a record whose ``'report'`` entry maps
            an antivirus name to a truthy/falsy detection flag.
        binarize: If truthy, every stored feature value is replaced by 1.
        classifier: ``'RF'`` (the ``RFC`` estimator) or ``'SVM'``
            (RBF-kernel ``SVC``).
        subsample: Falsy to train on all rows; otherwise a fraction of the
            training rows, drawn with ``numpy.random.choice``.

    Returns:
        Tuple ``(res, key_dates, avstats)``: the concatenated
        ``experiment_stats`` results, the earliest test date of every
        period, and a ``defaultdict`` of detection counters.

    Raises:
        ValueError: If ``classifier`` is neither ``'RF'`` nor ``'SVM'``.
    """
    # Fail fast: previously an unknown value surfaced only as an
    # UnboundLocalError on `clf`, after the first training file was loaded.
    if classifier not in ('RF', 'SVM'):
        raise ValueError(
            "Unknown classifier {!r}; expected 'RF' or 'SVM'".format(
                classifier))

    print('Performing experiment')
    res = []
    key_dates = []
    avstats = collections.defaultdict(int)
    for w, (f_tr, f_te) in enumerate(zip(train_fs, test_fs), start=1):
        # Load test dates
        dates = numpy.array(load_dates(f_te))
        week_s, week_e = dates.min(), dates.max()
        key_dates.append(week_s)
        print('\nPeriod {} [{} - {}]'.format(w, week_s, week_e))

        # Load training data
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_tr, y_tr = datasets.load_svmlight_file(f_tr)
        print(X_tr.shape)
        if subsample:
            # NOTE: numpy.random.choice samples with replacement by default,
            # so the subsample may contain repeated rows.
            new_size = int(round(X_tr.shape[0] * subsample))
            subsam = numpy.random.choice(X_tr.shape[0], new_size)
            X_tr = X_tr[subsam, :]
            y_tr = y_tr[subsam]
        if binarize:
            X_tr.data = numpy.ones_like(X_tr.data)
        X_tr = X_tr.toarray()

        # Train classifier
        if classifier == 'RF':
            clf = RFC(n_estimators=200, n_jobs=1 if subsample else -1)
        else:  # 'SVM' (validated above)
            clf = SVC(kernel='rbf', gamma=0.0025, C=12)
        sample_weight = None
        print('Training set size: {}'.format(X_tr.shape))
        clf.fit(X_tr, y_tr, sample_weight=sample_weight)
        # Remember the feature count so the test matrix gets the same width.
        tr_n_feats = X_tr.shape[1]
        del X_tr

        # Load and classify test data
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_te, y_te = datasets.load_svmlight_file(f_te,
                                                     n_features=tr_n_feats)
        if binarize:
            X_te.data = numpy.ones_like(X_te.data)
        X_te = X_te.toarray()
        print('Test set size: {}'.format(X_te.shape))
        y_pr = clf.predict(X_te)
        # Continuous score: class-1 probability for RF, signed margin for SVM.
        if classifier == 'RF':
            y_val = clf.predict_proba(X_te)[:, 1]
        else:
            y_val = clf.decision_function(X_te)
        del X_te

        # Evaluate experimental results
        res.append(experiment_stats(y_tr, y_te, y_pr, y_val))

        # Load file IDs of the test samples whose true label is > 0.5
        fileIDs = numpy.array(load_SHA256_sums(f_te))[numpy.where(y_te > 0.5)]

        # Update AV detection results
        for fid in fileIDs:
            avstats['Total'] += 1
            if fid in avstats_in:
                # .items() works on both Python 2 and Python 3 dicts.
                for av, det in avstats_in[fid]['report'].items():
                    if det:
                        avstats[av] += 1
        del fileIDs
        # Positives (label > 0.5) that the classifier predicted correctly.
        avstats['Hidost'] += numpy.logical_and(y_te == y_pr, y_te > 0.5).sum()
    res = numpy.concatenate(res)
    return res, key_dates, avstats
Пример #15
0
def _report_binary_metrics(y_true, y_pred, n_samples):
    """Print prediction count, confusion-matrix cells and derived scores.

    Args:
        y_true: True binary labels of the test samples.
        y_pred: Predicted binary labels for the same samples.
        n_samples: Denominator used for the accuracy.
    """
    # The doubled space after the colon reproduces the output of the original
    # Python 2 `print "...: ", value` statement byte for byte.
    print("Predicted Important citations:  %s" % int(sum(y_pred)))
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print("TP,TN,FP,FN %s %s %s %s" % (tp, tn, fp, fn))
    precision = 1.0 * tp / (tp + fp)
    recall = 1.0 * tp / (tp + fn)
    accuracy = 1.0 * (tp + tn) / n_samples
    f1 = 2.0 * (precision * recall) / (precision + recall)
    print('Accuracy: %s Recall: %s Precision: %s F1: %s'
          % (accuracy, recall, precision, f1))


def main():
    """Fit three classifiers, print their test metrics and plot their curves.

    Random forest, logistic regression and Gaussian naive Bayes are fitted on
    the training split returned by ``getDatasets``. For each one the
    confusion-matrix based metrics are printed, and ROC and precision-recall
    curves are drawn into matplotlib figures 1 and 2.
    """
    # X_all / y_all were only used by the disabled cross-validation runs.
    X_train, X_test, y_train, y_test, X_all, y_all = getDatasets()
    # Each print builds one string so the output is identical on Python 2
    # (print statement) and Python 3 (print function).
    print('Train: %s Test: %s' % (len(X_train), len(X_test)))
    print("Number of important citation in Training and Testing respectively:  %s %s"
          % (int(sum(y_train)), int(sum(y_test))))

    # NOTE: an SVM baseline, svm.SVC(C=0.75, kernel='rbf', gamma='auto',
    # probability=True, class_weight={1: 6}), was evaluated here in the same
    # way and is currently disabled, as is cross_val_score(clf, X_all, y_all,
    # cv=3) for every model.

    print('****Random Forest****')
    clf = RandomForestClassifier(n_jobs=-1, random_state=0, class_weight={1: 6})
    clf.fit(X_train, y_train)
    _report_binary_metrics(y_test, clf.predict(X_test), len(X_test))
    y_pred_rf = clf.predict_proba(X_test)[:, 1]
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
    precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_pred_rf)

    print('****Logistic Regression****')
    clf = linear_model.LogisticRegression(C=0.75, class_weight={1: 6})
    clf.fit(X_train, y_train)
    _report_binary_metrics(y_test, clf.predict(X_test), len(X_test))
    # Logistic regression is scored by its decision function rather than by
    # a class probability.
    y_pred_lr = clf.decision_function(X_test)
    fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_lr)
    precision_lr, recall_lr, _ = precision_recall_curve(y_test, y_pred_lr)

    print('****naive bayes****')
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    _report_binary_metrics(y_test, clf.predict(X_test), len(X_test))
    y_pred_nb = clf.predict_proba(X_test)[:, 1]
    fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_nb)
    precision_nb, recall_nb, _ = precision_recall_curve(y_test, y_pred_nb)

    # Figure 1: ROC curves, with the diagonal as the chance-level reference.
    plt.figure(1)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.plot(fpr_rf, tpr_rf, label='Random Forest')
#     plt.plot(fpr_svm, tpr_svm, label='SVM')
    plt.plot(fpr_lr, tpr_lr, label='Logistic Regression')
    plt.plot(fpr_nb, tpr_nb, label='Naive Bayes')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
#     plt.show()

    # Figure 2: precision-recall curves.
    plt.figure(2)
#     plt.plot(recall_svm, precision_svm, label='SVM')
    plt.plot(recall_rf, precision_rf, label='Random Forest')
    plt.plot(recall_lr, precision_lr, label='Logistic Regression')
    plt.plot(recall_nb, precision_nb, label='Naive Bayes')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.legend(loc='best')
        pred_vals = np.array([])
        pred_prob = np.array([])
        for drug in drug_names:
            mask = screen_drugs == drug
            x_train = x[np.invert(mask), :]
            y_train = y[np.invert(mask)]
            x_test = x[mask, :]
            y_test = y[mask]

            _ = model.fit(x_train, y_train)

            true_vals = np.append(true_vals, y_test)
            pred_vals = np.append(pred_vals, model.predict(x_test))
            if isinstance(model, LinearSVC):
                pred_prob = np.append(pred_prob,
                                      model.decision_function(x_test))
            else:
                pred_prob = np.append(pred_prob,
                                      model.predict_proba(x_test)[:, 1])

        bm_model += [evaluate(pred_vals, true_vals, pred_prob)]

    benchmarks_models += [bm_model]

for bm in benchmarks_models:
    print "\t".join([str(round(temp, 2)) for temp in np.mean(bm, axis=0)])
    print "\t".join([
        "{:.0e}".format(
            scipy.stats.ttest_ind(
                np.array(benchmark_10_cross)[:, i],
                np.array(bm)[:, i]).pvalue) for i in range(6)
# Pair every column of x_test with the fitted model's importance score,
# then order the table from most to least important.
fi = pd.DataFrame({
    'feature': list(x_test.columns),
    'importance': model.feature_importances_,
})
fi = fi.sort_values('importance', ascending=False)

# Display and save
print(fi.head())
fi.to_csv("keep_4_2016_RF_Feat_imp.csv", index=False)
# NOTE(review): x is defined outside this snippet; judging by the file name it
# presumably holds the predictions -- confirm.
x.to_csv("keep_4_2016_RF_pred.csv", index=False)

#%%
# SVM
# Get the precision-recall score: average precision computed from the model's
# decision-function scores on the test set.
from sklearn.metrics import average_precision_score
y_score = model.decision_function(x_test)
average_precision = average_precision_score(y_test, y_score)
print("Average Precision")
print(average_precision)
print("Confusion Matrix")
print(cm)


#%%

# list_in_order = ['prediction','y_test', 'y_score']
# for ind in range(len(list(x_test)) - 3):
#     list_in_order.append(list(x_test)[ind])

# output = pd.DataFrame(x_test, columns = list_in_order)
Пример #18
0
def perform_experiment(train_fs, test_fs, avstats_in, binarize,
                       classifier='RF', subsample=False):
    """Train and evaluate one classifier per (train, test) file pair.

    Args:
        train_fs: Training files in svmlight format, one per period.
        test_fs: Test files in svmlight format, paired with ``train_fs``
            by position.
        avstats_in: Mapping from a file ID (as returned by
            ``load_SHA256_sums``) to a record whose ``'report'`` entry maps
            an antivirus name to a truthy/falsy detection flag.
        binarize: If truthy, every stored feature value is replaced by 1.
        classifier: ``'RF'`` (the ``RFC`` estimator) or ``'SVM'``
            (RBF-kernel ``SVC``).
        subsample: Falsy to train on all rows; otherwise a fraction of the
            training rows, drawn with ``numpy.random.choice``.

    Returns:
        Tuple ``(res, key_dates, avstats)``: the concatenated
        ``experiment_stats`` results, the earliest test date of every
        period, and a ``defaultdict`` of detection counters.

    Raises:
        ValueError: If ``classifier`` is neither ``'RF'`` nor ``'SVM'``.
    """
    # Fail fast: previously an unknown value surfaced only as an
    # UnboundLocalError on `clf`, after the first training file was loaded.
    if classifier not in ('RF', 'SVM'):
        raise ValueError(
            "Unknown classifier {!r}; expected 'RF' or 'SVM'".format(
                classifier))

    print('Performing experiment')
    res = []
    key_dates = []
    avstats = collections.defaultdict(int)
    for w, (f_tr, f_te) in enumerate(zip(train_fs, test_fs), start=1):
        # Load test dates
        dates = numpy.array(load_dates(f_te))
        week_s, week_e = dates.min(), dates.max()
        key_dates.append(week_s)
        print('\nPeriod {} [{} - {}]'.format(w, week_s, week_e))

        # Load training data
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_tr, y_tr = datasets.load_svmlight_file(f_tr)
        print(X_tr.shape)
        if subsample:
            # NOTE: numpy.random.choice samples with replacement by default,
            # so the subsample may contain repeated rows.
            new_size = int(round(X_tr.shape[0] * subsample))
            subsam = numpy.random.choice(X_tr.shape[0], new_size)
            X_tr = X_tr[subsam, :]
            y_tr = y_tr[subsam]
        if binarize:
            X_tr.data = numpy.ones_like(X_tr.data)
        X_tr = X_tr.toarray()

        # Train classifier
        if classifier == 'RF':
            clf = RFC(n_estimators=200, n_jobs=1 if subsample else -1)
        else:  # 'SVM' (validated above)
            clf = SVC(kernel='rbf', gamma=0.0025, C=12)
        sample_weight = None
        print('Training set size: {}'.format(X_tr.shape))
        clf.fit(X_tr, y_tr, sample_weight=sample_weight)
        # Remember the feature count so the test matrix gets the same width.
        tr_n_feats = X_tr.shape[1]
        del X_tr

        # Load and classify test data
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_te, y_te = datasets.load_svmlight_file(f_te, n_features=tr_n_feats)
        if binarize:
            X_te.data = numpy.ones_like(X_te.data)
        X_te = X_te.toarray()
        print('Test set size: {}'.format(X_te.shape))
        y_pr = clf.predict(X_te)
        # Continuous score: class-1 probability for RF, signed margin for SVM.
        if classifier == 'RF':
            y_val = clf.predict_proba(X_te)[:, 1]
        else:
            y_val = clf.decision_function(X_te)
        del X_te

        # Evaluate experimental results
        res.append(experiment_stats(y_tr, y_te, y_pr, y_val))

        # Load file IDs of the test samples whose true label is > 0.5
        fileIDs = numpy.array(
            load_SHA256_sums(f_te))[numpy.where(y_te > 0.5)]

        # Update AV detection results
        for fid in fileIDs:
            avstats['Total'] += 1
            if fid in avstats_in:
                # .items() works on both Python 2 and Python 3 dicts.
                for av, det in avstats_in[fid]['report'].items():
                    if det:
                        avstats[av] += 1
        del fileIDs
        # Positives (label > 0.5) that the classifier predicted correctly.
        avstats['Hidost'] += numpy.logical_and(y_te == y_pr, y_te > 0.5).sum()
    res = numpy.concatenate(res)
    return res, key_dates, avstats
Пример #19
0
def main():
    """Run a series of classifier experiments and report each one's loss.

    Every section fits one model (optionally wrapped in isotonic
    calibration) on the training split and passes its validation-set
    predictions to ``log_loss_mc``. The sections are independent of each
    other apart from sharing the loaded data.
    """
    train_size = 0.8


    X_train, X_valid, y_train, y_valid, scaler = load_train_data(train_size=train_size, scale_it=True, square_root_it=True)
    X_test, X_test_ids = load_test_data(scaler=scaler, square_root_it=True)

    full_X_train, _, full_y_train, _, full_scaler = load_train_data(full_train=True, scale_it=True, square_root_it=True)
    X_test_for_full, X_test_ids = load_test_data(scaler=full_scaler, square_root_it=True)


    # logistic
    # loss = ~0.6...
    # clf = LogisticRegression()
    # clf.fit(X_train, y_train)
    # clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    # clf_isotonic.fit(X_train, y_train)
    # y_valid_predicted = clf_isotonic.predict_proba(X_valid)
    # log_loss_mc(y_valid, y_valid_predicted)
    

    # gnb
    # loss = ~1.6...
    # clf = GaussianNB()
    # clf.fit(X_train, y_train)
    # clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    # clf_isotonic.fit(X_train, y_train)
    # y_valid_predicted = clf_isotonic.predict_proba(X_valid)
    # log_loss_mc(y_valid, y_valid_predicted)
    

    # rf
    # when n_estimators=100, without calibration, loss = ~0.6
    # when n_estimators=100, with calibration, loss = ~0.483
    clf = RandomForestClassifier(n_estimators=600, n_jobs=-1, verbose=1)
    clf.fit(X_train, y_train)
    clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    clf_isotonic.fit(X_train, y_train)
    y_valid_predicted = clf_isotonic.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)
    

    # linear svc
    clf = LinearSVC(C=1.0, verbose=2)
    clf.fit(X_train, y_train)
    # LinearSVC has no predict_proba: min-max scale the decision values into
    # [0, 1] as a crude stand-in for probabilities.
    prob_pos = clf.decision_function(X_valid)
    prob_pos = \
            (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    y_valid_predicted = prob_pos
    log_loss_mc(y_valid, y_valid_predicted)
    clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    clf_isotonic.fit(X_train, y_train)
    y_valid_predicted = clf_isotonic.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)


    # well, non-linear svc
    # NOTE(review): gamma=0.0 selected 1/n_features in old scikit-learn
    # releases and is rejected by newer ones -- confirm the target version.
    clf = SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, cache_size=2000, class_weight=None, verbose=True, max_iter=-1)
    clf.fit(X_train, y_train)
    prob_pos = clf.decision_function(X_valid)
    prob_pos = \
            (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    y_valid_predicted = prob_pos
    log_loss_mc(y_valid, y_valid_predicted)
    # http://stackoverflow.com/questions/29873981/error-with-sklearn-calibratedclassifiercv-and-svm
    clf_isotonic = CalibratedClassifierCV(OneVsRestClassifier(clf), cv=5, method='isotonic')
    clf_isotonic.fit(X_train, y_train)
    y_valid_predicted = clf_isotonic.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)


    # non-linear svc using sigmoidal
    # http://stackoverflow.com/questions/29873981/error-with-sklearn-calibratedclassifiercv-and-svm
    # probability=True
    clf = SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=True, cache_size=2000, class_weight=None, verbose=True, max_iter=-1)
    clf.fit(X_train, y_train)
    y_valid_predicted = clf.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)


    # nusvc, wtf?
    clf = NuSVC(nu=0.5, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=2000, verbose=True, max_iter=-1, random_state=None)
    clf.fit(X_train, y_train)
    prob_pos = clf.decision_function(X_valid)
    prob_pos = \
            (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    y_valid_predicted = prob_pos
    log_loss_mc(y_valid, y_valid_predicted)
    # http://stackoverflow.com/questions/29873981/error-with-sklearn-calibratedclassifiercv-and-svm
    clf_isotonic = CalibratedClassifierCV(OneVsRestClassifier(clf), cv=5, method='isotonic')
    clf_isotonic.fit(X_train, y_train)
    y_valid_predicted = clf_isotonic.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)


    # nusvc using sigmoidal?
    clf = NuSVC(nu=0.5, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=True, tol=0.001, cache_size=2000, verbose=True, max_iter=-1, random_state=None)
    clf.fit(X_train, y_train)
    y_valid_predicted = clf.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)


    # k-nearest neighbours (KNeighborsClassifier, not k-means)
    clf = KNeighborsClassifier(n_neighbors=9, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None)
    clf.fit(X_train, y_train)
    y_valid_predicted = clf.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)
    clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    clf_isotonic.fit(X_train, y_train)
    y_valid_predicted = clf_isotonic.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)


    # hyperopt?!
    # `svc` must stay a module-level name here: the SVC instances created
    # further down are bound to `svc_clf` so that they do not turn `svc` into
    # a function-local variable (which made this call raise
    # UnboundLocalError).
    estim = HyperoptEstimator( classifier=svc('mySVC') )
    estim.fit(X_train, y_train)


    # pca?!
    # http://scikit-learn.org/stable/auto_examples/plot_digits_pipe.html#example-plot-digits-pipe-py
    pca = PCA()
    logistic = LogisticRegression()
    pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    pipe.fit(X_train, y_train)
    y_valid_predicted = pipe.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)

    # pca + svc
    pca = PCA()
    # probability=True is required: predict_proba is called on the fitted
    # search below and SVC does not provide it otherwise.
    svc_clf = SVC(probability=True, cache_size=1000, verbose=True)
    pipe = Pipeline(steps=[('pca', pca), ('svc', svc_clf)])
    n_components = [20, 40, 64, 90]
    Cs = np.logspace(-4, 4, 5)
    #gammas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1]
    gammas = [0.001, 0.005, 0.01, 0.1, 1]
    estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              svc__C=Cs,
                              svc__gamma=gammas), verbose=2)
    estimator.fit(X_train, y_train)
    y_valid_predicted = estimator.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)


    # wow
    # Reload the data without the square-root transform and rescale the
    # features with a MinMaxScaler before grid-searching an SVC.

    from sklearn.preprocessing import MinMaxScaler
    train_size = 0.8
    X_train, X_valid, y_train, y_valid, scaler = load_train_data(train_size=train_size, scale_it=True, square_root_it=False)
    X_test, X_test_ids = load_test_data(scaler=scaler, square_root_it=False)
    full_X_train, _, full_y_train, _, full_scaler = load_train_data(full_train=True, scale_it=True, square_root_it=False)
    X_test_for_full, X_test_ids = load_test_data(scaler=full_scaler, square_root_it=False)

    mm_scaler = MinMaxScaler()
    X_train = mm_scaler.fit_transform(X_train)
    X_valid = mm_scaler.transform(X_valid)

    # probability=True for the same reason as above (predict_proba is used).
    svc_clf = SVC(probability=True, cache_size=1000, verbose=False)
    gammas = np.exp2([-7, -5, -3, 0, 3, 5, 7])
    Cs = np.exp2([-7, -5, -3, 0, 3, 5, 7])
    pipe = Pipeline(steps=[('svc', svc_clf)])
    estimator = GridSearchCV(pipe,
                         dict(svc__C=Cs,
                              svc__gamma=gammas), verbose=2)
    estimator.fit(X_train, y_train)
    y_valid_predicted = estimator.predict_proba(X_valid)
    log_loss_mc(y_valid, y_valid_predicted)