Example #1
    gd.fit(X, Y)
    return gd


#SVC(C=10, gamma=0.001, kernel='linear')

if __name__ == "__main__":

    #importing the datasets
    df = pd.read_csv(config.KFOLD_TRAIN_DATA)
    os_df = pd.read_csv(config.OVERSAMPLED_TRAIN_DATA)
    test_df = pd.read_csv(config.TEST_DATA)

    # calculate the initial score without hyperparameter tuning,
    # using stratified K-fold cross-validation
    model = svm.SVC()
    initial_score = score(df, model)
    print(f"Initial roc_auc score is: {initial_score}")

    # calculate the score on the test data
    x_test = test_df.drop('fraudulent', axis=1).values
    y_test = test_df.fraudulent.values
    y_pred = model.predict(x_test)
    test_score = metrics.roc_auc_score(y_test, y_pred)
    print(f'Initial test score: {test_score}')

    # tune hyperparameters and get the best parameters
    params = best_parameter(os_df)
    print(f"Best parameter are {params}")

    # retrain on the training dataset with the best hyperparameters
Example #2
training_data = [data.data[i] for i in data.training_rows]  # index rows the same way as the labels below
training_labels = [data.labels[i] for i in data.training_rows]

test_data = [data.data[i] for i in labels]

print("\nWorking with %s, out file prefix %s" % (l_f, filename))
data = np.asarray(training_data)
print("Data shape overall: ", data.shape)

clf = ExtraTreesClassifier()
clf = clf.fit(data, training_labels)
important_features = filename + ".important_features"
with open(important_features, "w") as f:
    for i in range(len(clf.feature_importances_)):
        f.write("%d\t%f\n" % (i, clf.feature_importances_[i]))

model = SelectFromModel(clf, prefit=True)
data_new = model.transform(data)
print("New data shape: ", data_new.shape)

print()

test_data = model.transform(test_data)

svm_clf = svm.SVC()
svm_clf.fit(data_new, training_labels)
predictions = svm_clf.predict(test_data)
with open(filename + ".predict", "w") as f:
    for i in range(len(labels)):
        f.write("%d %d\n" % (labels[i], predictions[i]))
Example #3
    return xx, yy


def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
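
# Note: the make_meshgrid helper is truncated above; in the scikit-learn docs
# example this snippet appears to come from, it is defined as (sketch):
#
# def make_meshgrid(x, y, h=.02):
#     x_min, x_max = x.min() - 1, x.max() + 1
#     y_min, y_max = y.min() - 1, y.max() + 1
#     xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
#                          np.arange(y_min, y_max, h))
#     return xx, yy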


iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

C = 1.0
models = (svm.SVC(kernel='linear', C=C), svm.LinearSVC(C=C),
          svm.SVC(kernel='rbf', gamma=0.7,
                  C=C), svm.SVC(kernel='poly', degree=3, C=C))
models = (clf.fit(X, y) for clf in models)

titles = ('SVC with linear kernel', 'LinearSVC (linear kernel)',
          'SVC with RBF kernel', 'SVC with polynomial (degree 3) kernel')

fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
Example #4
#clf = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
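
# The grid search below passes `cv=sss`, which this snippet never defines; a
# minimal sketch of the assumed StratifiedShuffleSplit instance (parameter
# values are illustrative, not from the original project):
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=42)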

from sklearn import svm

estimators = [('scaler', StandardScaler()),
              ('feature_selection', SelectKBest()),
              ('reducer', PCA(random_state=42)), ('svm', svm.SVC())]
pipe = Pipeline(estimators)
param_grid = ([{
    'feature_selection__k': [10, 13, 15, 'all'],
    'reducer__n_components': [2, 4, 6, 8, 10],
    'svm__C': np.logspace(-2, 3, 6),
    'svm__gamma': np.logspace(-4, 1, 6),
    'svm__class_weight': ['balanced', None],
    'svm__kernel': ['rbf', 'sigmoid']
}])

grid_search = GridSearchCV(pipe, param_grid, scoring='precision', cv=sss)
grid_search.fit(features, labels)
#labels_predictions = grid_search.predict(features_test)

clf = grid_search.best_estimator_
Example #5

train_data_generator = TrainingDataGenerator("Training_data.json",
                                             filter_funct=batch_index_filter_1)
test_data_generator = TrainingDataGenerator("Training_data.json",
                                            filter_funct=batch_index_filter_2)
X_train, y_train = train_data_generator.get_data()
X_test, y_test = test_data_generator.get_data()

# defining a pipeline for calibration purposes
cal_pipeline = create_extraction_pipeline(variance=0.99, n_avgs=2)
cal_pipeline.fit(X_test)

# preprocessing the pulled data
extraction_pipeline = create_extraction_pipeline(variance=0.99, n_avgs=2)
train_extracted_features = extraction_pipeline.fit_transform(X_train)

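# Splice the calibration scaler's means over the leading entries of the freshly
# fitted scaler, so features shared with the calibration run keep their
# calibrated statistics (assumed intent of the block below).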
cal_mean = cal_pipeline.get_params()['std_scaler'].mean_
cur_mean = extraction_pipeline.get_params()['std_scaler'].mean_
new_mean = np.append(cal_mean, cur_mean[cal_mean.shape[0]:])
extraction_pipeline.get_params()['std_scaler'].mean_ = new_mean

test_extracted_features = extraction_pipeline.transform(X_test)

# defining the classifier and getting predictions
poly_clf = svm.SVC(kernel="poly", degree=3, C=1000)
poly_clf.fit(train_extracted_features, y_train)
y_pred = poly_clf.predict(test_extracted_features)

# generating a confusion matrix
print("Accuracy score : ", accuracy_score(y_test, y_pred))
movement_labels = train_data_generator.get_movement_labels()
generate_confusion_matrix(y_pred, y_test, movement_labels)
Example #6
from sklearn import svm
from sklearn.datasets import load_svmlight_files

X_train, y_train, X_test, y_test = load_svmlight_files(
    ('data/ml14fall_train.dat', 'data/ml14fall_test1_no_answer.dat'))

print "read data finished"

poly_clf = svm.SVC(kernel='poly', degree=5)
poly_clf = poly_clf.fit(X_train[:50], y_train[:50])
print "fit model finished"
prediction = poly_clf.predict(X_test[:50])
print(prediction)


def write_result(pred, result_path):
    result_content = '\n'.join([str(int(p)) for p in pred])
    with open(result_path, 'w') as result:
        result.write(result_content)
    print "result has saved into %s" % result_path


write_result(prediction, 'poly_5_path')
Example #7
#Create list of which pulses were classified incorrectly
inc_ind = [i for i in range(len(y_test)) if y_test[i]!=pred_comb[i]]
incorrect = [(y_test[i], pred_comb[i],
              clf.pred[1][i], prob[1][i][1],
              clf.pred[2][i], prob[2][i][1],
              clf.pred[3][i], prob[3][i][1],
              clf.pred[4][i], prob[4][i][1]) for i in inc_ind]

#Print diagnostics
print(classification_report(y_test, pred_comb, digits=5))

#Other classifiers I tried:
#Naive Bayes classifier; calculate score
# (on modern scikit-learn: from sklearn.model_selection import cross_val_score)
clf_nb = GaussianNB()
score_nb = cross_val_score(clf_nb, X, y, cv=cv)
score_nb_mean = score_nb.mean()
score_nb_std = score_nb.std()

#SVM classifier; calculate score
clf_svm = svm.SVC()
score_svm = cross_val_score(clf_svm, X, y, cv=cv)
score_svm_mean = score_svm.mean()
score_svm_std = score_svm.std()

#Random Forest classifier; calculate score, predicted labels, confusion matrix
clf_rf = RandomForestClassifier()
score_rf = cross_val_score(clf_rf, X, y, cv=cv)
score_rf_mean = score_rf.mean()
score_rf_std = score_rf.std()
clf_rf.fit(X_train, y_train)
pred_rf = clf_rf.predict(X_test)
cm_rf = confusion_matrix(y_test, pred_rf)

#Combined classifier with random forest base; calculate score, predicted labels, confusion matrix
cwbrf_list = [RandomForestClassifier(),
Example #8
print(X.shape)

mean_fpr = np.linspace(0, 1, 100)
tprs = []
aucs = []

k = 0
for train, validation in kfold.split(X, Y):

    # clf = RandomForestClassifier()
    # clf.fit(X[train], Y[train])

    # clf = XGBClassifier()
    # clf.fit(X[train], Y[train])

    clf = svm.SVC(kernel='rbf', probability=True)
    clf.fit(X[train], Y[train])

    # y_score = clf.predict(X_test, 1)
    # Y_pred = clf.predict_proba(X)[:, 1]

    # clf = LogisticRegressionCV(cv=5, penalty='l2', tol=0.0001, fit_intercept=True, intercept_scaling=1,
    #                            class_weight=None, random_state=None,
    #                            max_iter=100, verbose=0, n_jobs=None).fit(X[train], Y[train])

    y_score = clf.predict_proba(X[validation])[:, 1]
    # evaluation
    fpr, tpr, threshold = roc_curve(Y[validation], y_score,
                                    pos_label=1)  # compute the TPR and FPR
    roc_auc = auc(fpr, tpr)  # compute the AUC value
    aucs.append(roc_auc)
Example #9
with open('/Users/anshulramachandran/Desktop/all_train.csv',
          newline='') as csvfile:
    filereader = csv.reader(csvfile, delimiter=',')
    for row in filereader:
        trainX.append([float(val) for val in row[1:]])
        trainY.append(int(row[0]) - 1)

with open('/Users/anshulramachandran/Desktop/all_validation.csv',
          newline='') as csvfile:
    filereader = csv.reader(csvfile, delimiter=',')
    for row in filereader:
        testX.append([float(val) for val in row[1:]])
        testY.append(int(row[0]) - 1)

print('start')
clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(trainX, trainY)

train_acc = clf.score(trainX, trainY)
test_acc = clf.score(testX, testY)
print(train_acc, test_acc)

validation_predictions = clf.predict(testX)

# Generate confusion matrix
confusion_matrix = np.zeros(shape=(200, 200))

for i in range(len(testY)):
    class_true = testY[i]
    class_pred = validation_predictions[i]
    confusion_matrix[class_true][class_pred] += 1
Example #10
# ERROR
erreur.append(metrics.zero_one_loss(ytest, clf_ANN.predict(xtest)))
# ACCURACY
score = clf_ANN.score(xtest, ytest)
precision.append(score)
print(" ANN accuracy: ", score)
# TIME
t = time.process_time() - begin
temps.append(t)
print("Elapsed time ANN: ", t)
cm = confusion_matrix(ytest, clf_ANN.predict(xtest))
print("Confusion matrix:\n", cm, "\n")

# SVM
begin = time.process_time()
clf_SVM = svm.SVC(kernel='poly', C=0.6)
clf_SVM.fit(xtrain, ytrain)
# ERROR
erreur.append(metrics.zero_one_loss(ytest, clf_SVM.predict(xtest)))
# ACCURACY
score = clf_SVM.score(xtest, ytest)
precision.append(score)
print(" SVM accuracy: ", score)
# TIME
t = time.process_time() - begin
temps.append(t)
print("Elapsed time SVM: ", t, "\n")
cm = confusion_matrix(ytest, clf_SVM.predict(xtest))
print("Confusion matrix:\n", cm, "\n")

algo = ['KNN', 'ANN', 'SVM']
Example #11
# kernel function
def gaussian_kernel(x1, x2, sigma):
    return np.exp(- np.power(x1 - x2, 2).sum() / (2 * (sigma ** 2)))

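# A sketch (not from the original snippet) of using gaussian_kernel as a custom
# SVC kernel: SVC accepts a callable mapping two sample matrices to a Gram
# matrix, so the pairwise helper is wrapped first.
def gaussian_gram(X1, X2, sigma=0.1):
    return np.array([[gaussian_kernel(a, b, sigma) for b in X2] for a in X1])
# e.g. svm.SVC(kernel=gaussian_gram) behaves like kernel='rbf' with gamma = 1/(2*sigma**2)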

# 2.2 Data Preprocess----------------------------------------------------------------------------
# 2.2.1 load data
mat = sio.loadmat('ex6data2.mat')
data = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
data['y'] = mat.get('y')

# 2.2.2 plot
sns.set(context="notebook", style="white", palette=sns.diverging_palette(240, 10, n=2))
sns.lmplot(x='X1', y='X2', hue='y', data=data,
           height=5,  # `size=` in older seaborn releases
           fit_reg=False,
           scatter_kws={"s": 10})
# plt.show()

# 2.3 SVM ----------------------------------------------------------------------------------------------
svc = svm.SVC(C=100, kernel='rbf', gamma=10, probability=True)  # non-linear SVM
svc.fit(data[['X1', 'X2']], data['y'])
print(svc.score(data[['X1', 'X2']], data['y']))  # mean accuracy


predict_prob = svc.predict_proba(data[['X1', 'X2']])[:, 1]  # predict_proba returns shape (n_samples, n_classes)
# use [:, 1] or [:, 0] to pick the class whose probability we want
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(data['X1'], data['X2'], s=30, c=predict_prob, cmap='Reds')  # color encodes predicted probability
plt.show()
Example #12

def fit_with_kernel(inFeatures, inLabels, inKernel="rbf", maxIter=-1):
    toReturn = svm.SVC(kernel=inKernel, max_iter=maxIter)
    toReturn.fit(inFeatures, inLabels)
    return toReturn
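
# Usage sketch (variable names hypothetical):
# clf_rbf = fit_with_kernel(features, labels)
# clf_lin = fit_with_kernel(features, labels, inKernel="linear", maxIter=1000)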
Example #13
def Train_And_Test_Image_Classifier(split):
    phone_images = []
    for image_file in [
            img_f for img_f in os.listdir(".")
            if img_f.startswith("yes") and img_f.endswith("jpg")
    ]:
        image = imageio.imread(image_file)
        image = img_as_float(image)
        image = rgb2gray(image)
        image_prewitt = prewitt(image)
        phone_images.append([image, image_prewitt])

    n_phone_images = len(phone_images)
    #split phone images into training and testing sets
    factor_pi = int(n_phone_images / 3)
    training_phone_images = []
    testing_phone_images = []

    if split == 0:
        ## hold out the last 1/3 for testing
        training_phone_images = phone_images[:factor_pi * 2]
        testing_phone_images = phone_images[factor_pi * 2:]
    elif split == 1:
        ## hold out the first 1/3 for testing
        training_phone_images = phone_images[factor_pi:]
        testing_phone_images = phone_images[:factor_pi]
    else:
        ## hold out the middle 1/3 for testing
        training_phone_images = (phone_images[:factor_pi] +
                                 phone_images[factor_pi * 2:])
        testing_phone_images = phone_images[factor_pi:factor_pi * 2]

    non_phone_images = []
    for image_file in [
            img_f for img_f in os.listdir(".")
            if img_f.startswith("no") and img_f.endswith("jpg")
    ]:
        image = imageio.imread(image_file)
        image = img_as_float(image)
        image = rgb2gray(image)
        image_prewitt = prewitt(image)
        non_phone_images.append([image, image_prewitt])

    n_non_phone_images = len(non_phone_images)
    #split non-phone images into training and testing sets
    factor_npi = int(n_non_phone_images / 3)
    training_non_phone_images = []
    testing_non_phone_images = []

    if split == 0:
        ## hold out the last 1/3 for testing
        training_non_phone_images = non_phone_images[:factor_npi * 2]
        testing_non_phone_images = non_phone_images[factor_npi * 2:]
    elif split == 1:
        ## hold out the first 1/3 for testing
        training_non_phone_images = non_phone_images[factor_npi:]
        testing_non_phone_images = non_phone_images[:factor_npi]
    else:
        ## hold out the middle 1/3 for testing
        training_non_phone_images = (non_phone_images[:factor_npi] +
                                     non_phone_images[factor_npi * 2:])
        testing_non_phone_images = non_phone_images[factor_npi:factor_npi * 2]

    training_set = training_phone_images + training_non_phone_images
    training_set_output = [1] * len(training_phone_images) + [0] * len(
        training_non_phone_images)

    testing_set = testing_phone_images + testing_non_phone_images
    testing_set_output = [1] * len(testing_phone_images) + [0] * len(
        testing_non_phone_images)

    n_training_set = len(training_set)
    training_set = np.array(training_set)
    training_set = training_set.reshape(n_training_set, -1)

    n_testing_set = len(testing_set)
    testing_set = np.array(testing_set)
    testing_set = testing_set.reshape(n_testing_set, -1)

    classifier = svm.SVC(C=100, probability=True, random_state=0)
    classifier.fit(training_set, training_set_output)
    pickle.dump(classifier, open("cellphone_image_classifier.sav", "wb"))

    predicted = classifier.predict(testing_set)
    print(classifier.score(testing_set, testing_set_output))

    i = 0
    while i < len(testing_set_output):
        if predicted[i] != testing_set_output[i]:
            print(i, predicted[i], testing_set_output[i])
        i += 1

    print("Classification report for classifier %s:\n%s\n" %
          (classifier,
           metrics.classification_report(testing_set_output, predicted)))

    predict_prob = classifier.predict_proba(testing_set)
    i = 0
    while i < len(testing_set_output):
        if predict_prob[i][0] > predict_prob[i][1]:
            if testing_set_output[i] != 0:
                print(i, "0: ", predict_prob[i][0], "1: ", predict_prob[i][1],
                      " should be: 1")
        else:
            if testing_set_output[i] != 1:
                print(i, "0: ", predict_prob[i][0], "1: ", predict_prob[i][1],
                      " should be: 0")
        i += 1
    print(" ")
Example #14
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt

iris = load_iris()
pca = PCA(n_components=2)
data = pca.fit(iris.data).transform(iris.data)
print(data.shape)
datamax = data.max(axis=0) + 1
datamin = data.min(axis=0) - 1
print(datamax)
print(datamin)
n = 2000
X, Y = np.meshgrid(np.linspace(datamin[0], datamax[0], n),
                   np.linspace(datamin[1], datamax[1], n))
svc = svm.SVC()
svc.fit(data, iris.target)
Z = svc.predict(np.c_[X.ravel(), Y.ravel()])
print(np.unique(Z))
plt.contour(X, Y, Z.reshape(X.shape), levels=[0, 1], colors=['r', 'g'])
# plt.show()
for i, c in zip([0, 1, 2], ['r', 'g', 'b']):
    d = data[iris.target == i]
    plt.scatter(d[:, 0], d[:, 1], c=c)
plt.show()
Example #15
# replace all missing values(?) with -99999
df.replace('?', -99999, inplace=True)
# drop id column since it is not a useful feature
df.drop(['id'], axis=1, inplace=True)

#input data
X = np.array(df.drop(['class'], axis=1))
# output data
y = np.array(df['class'])

# Split the input data into training and test sets
# (train_test_split lives in sklearn.model_selection on modern scikit-learn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# initialize the SVM classifier
clf = svm.SVC()

# train the classifier
clf.fit(X_train, y_train)

# find the accuracy
accuracy = clf.score(X_test, y_test)
print(accuracy)

# unknown sample for prediction
example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1],
                             [4, 2, 1, 1, 1, 2, 3, 2, 1]])
# reshape to 2D to avoid deprecation errors; len() gives the number of samples
example_measures = example_measures.reshape(len(example_measures), -1)

# prediction
prediction = clf.predict(example_measures)
print(prediction)
"""
This module sets up different machines for machine learing algorithms based on the needs for the user. Set up as a library to import with mirrored commands for each machine.
"""

#This is the Support Vector Machine Section

from sklearn import svm
SVM = svm.SVC()


def SVMfit(x, y):
    """
    Input: x, y
    x (Array): An array of training points for the svm to set up an algorithm.
    y (Array): An array of values for the corresponding training points.
    Returns: NA
    Description: Sets up the svm with an algorithm to predict input data values.
    """
    SVM.fit(x, y)


def SVMpredict(x):
    """
    Input: x
    x (Array): An array holding a data point to predict the value of.
    Returns: The predicted value of the data point.
    Description: Uses the svm to predict the value of the input data point.
    """
    return SVM.predict(x)
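
# Minimal usage sketch for the mirrored commands (toy data, illustrative only):
# SVMfit([[0, 0], [1, 1]], [0, 1])
# print(SVMpredict([[0.9, 0.9]]))  # predict expects a 2D array-like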



Example #17

# features selected for the model
select_feature_list = ['call_count_per_day', 'phone_loan_times_per_platform',
                       'idcard_loan_platform_num', 'idcard_loan_times_per_platform',
                       'call_count', 'sustained_days', 'gender']


# create the learning models
rf = RF(n_estimators=40)
ada_tree = Ada(n_estimators=40)
lr = LR()
nb1 = MultinomialNB()
nb2 = GaussianNB()
s_v_m = svm.SVC(C=1)

ada_lr = Ada(base_estimator=LR(), n_estimators=40, algorithm='SAMME')
ada_nb2 = Ada(base_estimator=GaussianNB(), n_estimators=40, algorithm='SAMME')
ada_svm = Ada(base_estimator=svm.SVC(), n_estimators=40, algorithm='SAMME')

# returns the model's accuracy
def model_estimate(clf, X, y, select_feature_list):
    result = []
    for i in range(10):
        num = int(len(y)*0.7)
        random_index = np.random.permutation(len(y))
        build_index = random_index[:num]
        test_index = random_index[num:]
        X_build = X.iloc[build_index].copy()
        y_build = y.iloc[build_index]
Example #18
housing_pct['US_HPI_future'] = housing_pct['United States'].shift(-1)
housing_pct.dropna(inplace=True)

housing_pct['label'] = list(map(create_labels, housing_pct['United States'], housing_pct['US_HPI_future']))

# housing_pct['ma_apply_example'] = housing_pct['M30'].rolling(window=10).apply(moving_average)
# print(housing_pct.tail())
X = np.array(housing_pct.drop(['label', 'US_HPI_future'], axis=1))
y = np.array(housing_pct['label'])

X = scale(X)

X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# clf = svm.SVC(kernel='linear')
# clflog = LogisticRegression(C=50.0, dual=False, penalty="l1")
clflog_accuracy = []
clfsvm_accuracy = []

for i in range(10):
    # penalty="l1" requires the liblinear (or saga) solver in modern scikit-learn
    clflog = LogisticRegression(C=49.0, dual=False, penalty="l1", solver="liblinear")
    clflog.fit(X_train, y_train)
    clflog_accuracy.append(clflog.score(x_test, y_test))

    clfsvm = svm.SVC(kernel='linear')
    clfsvm.fit(X_train, y_train)
    clfsvm_accuracy.append(clfsvm.score(x_test, y_test))

print('Accuracy of logistic regression = %0.4f' % (mean(clflog_accuracy) * 100))
print('Accuracy of support vector machine = %0.4f' % (mean(clfsvm_accuracy) * 100))
Example #19
        X = hf['x'][:]
        Y = hf['y'][:]
    return X, Y

x, y = load_h5py('Data/data_3.h5')

print(y)


max = -1
maxindex = -1
res = []  # one column of one-vs-rest scores per class (filled via append below)
for i in range(np.amin(y), np.amax(y) + 1):

    print(i)
    y_train=np.zeros(y.shape)
    for j in range(0,y.shape[0]):
        if(y[j]==i):
            y_train[j]=1
        else:
            y_train[j]=0
#    print(y_train)
    model = svm.SVC(kernel='linear')
    model.fit(x, y_train)
    m = model.coef_
    res.append(np.dot(x, m.T))

print(res[0])

for i in range(0,x.shape[0]):
Example #20
                for finger in fingerPos:
                    if finger[0] < fingerPos[shortIndex][0]:
                        shortIndex = n
                    if finger[0] > fingerPos[longIndex][0]:
                        longIndex = n
                    n += 1
                longestDist = hr.dist(fingerPos[shortIndex][0], fingerPos[longIndex][0],
                                      fingerPos[shortIndex][1], fingerPos[longIndex][1])

                sampleData = [numFingers, fingerSum, longestDist]
                samples.append(sampleData)
                labels.append(label_array[image])

    
    #make svm
    scaler = preprocessing.StandardScaler().fit(samples)
    model = svm.SVC()
    model.fit(scaler.transform(samples), labels)
            
    #run test on reserved subset
    for image in sets[test_index]:
        im = cv.imread('image_' + str(image) + '.jpeg')
        segIm = hr.segment(im)
        palmPos, fingerPos, fingerLen = hr.extract(im, segIm)
                
        #Normalized FingerLength Feature
        numFingers = (len(fingerPos) - scaler.mean_[0]) / scaler.scale_[0]
    
        #Normalized FingerSum Feature
        fingerSum = 0
        for length in fingerLen:
            fingerSum = fingerSum + length
Example #21
    'random_state': 42,
    'gamma': 0
}

# unpack the params dict as keyword arguments; XGBClassifier has no `weights`
# parameter, so class weighting would go through `sample_weight` in fit()
model = XGBClassifier(**grid)
model.fit(X_train, Y_train)
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(Y_test, y_pred)
print("XGBClassifier Accuracy: %.2f%%" % (accuracy * 100.0))
toc = time.perf_counter()
print("XGBClassifier runtime: %.3f seconds" % (toc - tic))

tic = time.perf_counter()
model = svm.SVC(class_weight='balanced')
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(Y_test, y_pred)
print("SVM Accuracy: %.2f%%" % (accuracy * 100.0))
toc = time.perf_counter()
print("SVM runtime: %.3f seconds" % (toc - tic))

from sklearn.neighbors import KNeighborsClassifier

tic = time.perf_counter()
knn = KNeighborsClassifier(n_neighbors=60)
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
Example #22
    def __init__(self, class_num, x_train_raw, y_train_raw):
        self.tfidf = TfidfVectorizor(x_train_raw)
        x_train = [self.tfidf.process(sent) for sent in x_train_raw]
        y_train = y_train_raw
        self.svm = svm.SVC(decision_function_shape="ovr")
        self.svm.fit(x_train, y_train)
Example #23

# 1.2) Feature Extraction (Textual Features)

# The terms' weights were calculated using Term Frequency-Inverse Document Frequency (TF-IDF)
tfidf_vect = TfidfVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             max_features=50000)
tfidf_vect.fit(x_text)
x_text_tfidf = tfidf_vect.transform(x_text)

# 1.3) Feature Selection (Textual Features)

# Feature selection using a chi-square score was applied for each machine learning algorithm to select relevant textual features.

# Comment out the following block when not experimenting with different
# feature sizes for each classifier
clf = svm.SVC()
for x in range(5, 23, 15):
    test = SelectKBest(score_func=chi2, k=x)
    fit = test.fit(x_sm, y)
    x_s = fit.transform(x_sm)
    scores = cross_val_score(clf, x_s, y, cv=10)
    # print(scores)
test = SelectKBest(score_func=chi2, k=15)
fit = test.fit(x_sm, y)
x_s = fit.transform(x_sm)

clf = svm.SVC()
for x in range(500, 4000, 500):
    test = SelectKBest(score_func=chi2, k=x)
    fit = test.fit(x_text_tfidf, y)
    x_t = fit.transform(x_text_tfidf)
Example #24
import numpy as np
from sklearn import datasets
from sklearn import svm
# sklearn.cross_validation was removed; its helpers live in model_selection now
from sklearn.model_selection import cross_val_score, train_test_split

diabetes = datasets.load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape)  # test size 20%
print(X_test.shape, y_test.shape)
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, diabetes.data, diabetes.target, cv=4)  # 4 folds
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
# the accuracy is very poor: the diabetes target is continuous (a regression
# problem), so classifying it with SVC makes little sense; recent scikit-learn
# versions reject continuous targets for classifiers outright
Example #25

# step size in the mesh
h = .02

y_30 = np.copy(y)
y_30[rand.rand(len(y)) < 0.3] = -1

y_50 = np.copy(y)
y_50[rand.rand(len(y)) < 0.5] = -1  # 50% unlabeled, matching the name and plot title

# SVM (not scaled because we want to plot the support vectors)
ls30 = (label_propagation.LabelSpreading().fit(X, y_30), y_30)
ls50 = (label_propagation.LabelSpreading().fit(X, y_50), y_50)
ls100 = (label_propagation.LabelSpreading().fit(X, y), y)

rbf_svc = (svm.SVC(kernel='rbf', gamma = 0.5).fit(X, y), y)

#Create mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# titles for the plots
titles = ['LS 30% data', 'LS 50% data',
          'LS 100% data', 'SVC with RBF']

color_map = {-1: (1, 1, 1), 0: (0, 0, 0.9),
             1: (1, 0, 0), 2: (0.8, 0.6, 0)}

for i, (clf, y_train) in enumerate((ls30, ls50, ls100, rbf_svc)):
Example #26
            testing = pd.read_csv('../output/fit_prediction_{}_{}_of_2'.format(
                pathway, fold_num),
                                  delimiter='\t')

            if drop_cols:
                ncols = training.shape[1]
                training.drop(drop_cols, axis=1, inplace=True)
                assert training.columns.shape[0] == ncols - len(drop_cols)

                ncols = testing.shape[1]
                testing.drop(drop_cols, axis=1, inplace=True)
                assert testing.columns.shape[0] == ncols - len(drop_cols)

            # fit SVM on interactome_train
            model = svm.SVC(kernel=kernel,
                            probability=True,
                            class_weight=class_weight)

            # .ix was removed from pandas; .loc does the same label-based selection
            train_on = training.loc[:,
                                    ~training.columns.isin(['name', 'class'])]

            test_on = testing.loc[:, ~testing.columns.isin(['name', 'class'])]

            model.fit(train_on, training['class'])
            # save prediction as confidence, not as class
            predicted_probab = model.predict_proba(test_on)

            # reformat name such that it is #tail head score
            predicted_probab_df = pd.DataFrame(predicted_probab)
            predicted_df = testing['name'].\
                str.split('_to_', expand=True)
Example #27
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
print(digits.images)
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
print(data)

# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])

#persistence model
s = pickle.dumps(classifier)
joblib.dump(classifier, 'perPredict.pkl')

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier %s:\n%s\n" %
      (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
Example #28
# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)

# Learn to predict each class against the other
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure()
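# The plotting code is truncated after plt.figure(); a minimal sketch of the
# usual micro-average ROC plot that would follow:
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (AUC = %0.2f)' % roc_auc["micro"])
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()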
Example #29

sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn import svm

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

#features_train = features_train[:len(features_train)/100]
#labels_train = labels_train[:len(labels_train)/100]

clf = svm.SVC(kernel='rbf', C=10000)

t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time() - t0, 3), "s"

t0 = time()
print "Score of naive bayes algorithm:", clf.score(features_test, labels_test)
print "Score time:", round(time() - t0, 3), "s"

pred = clf.predict(features_test)

n = 0
for m in pred:
    if m == 1:
        n += 1
Example #30
pca_a = PCA(n_components=reducedDim_a)
pca_a.fit(training_data_proso)
# Transform training_data and testing data respectively
training_data_proso_transformed = pca_a.transform(training_data_proso)
testing_data_proso_transformed = pca_a.transform(testing_data_proso)

# Concatenate 'video training_data' and 'audio training_data' into a new feature 'combined_trainingData'
sample_train = np.concatenate(
    (training_data_transformed, training_data_proso_transformed), axis=1)

# Concatenate 'video testing_data' and 'audio testing_data' into a new feature 'combined_testingData'.
sample_test = np.concatenate(
    (testing_data_transformed, testing_data_proso_transformed), axis=1)

# Train SVM classifier
clf = svm.SVC(kernel='linear')
clf.fit(sample_train, training_class)

# The prediction results of training data and testing data respectively
pred_train = clf.predict(sample_train)
pred_test = clf.predict(sample_test)

# Calculate and Print the training accuracy and testing accuracy.
print('training accuracy: {}'.format(
    accuracy_score(training_class, pred_train, normalize=True)))
print('testing accuracy: {}'.format(
    accuracy_score(testing_class, pred_test, normalize=True)))

print('confusion matrix training:\n{}'.format(
    confusion_matrix(training_class, pred_train)))
print('confusion matrix testing:\n{}'.format(