Example #1
 def __init__(self, numFeatures, forestSize, learningRate, regularization):
     self.numFeatures = numFeatures
     self.forestSize = forestSize
     self.learningRate = learningRate
     self.regularization = regularization
     self.forestObject = randomForest(numFeatures, forestSize)
     self.svmObject = svm(numFeatures, learningRate, regularization, 100)
Example #2
	def __init__(self):
		self.svm = svm()
		self.param = svm_parameter()
		self.prob = svm_problem()
		self.x_space = None
		self.cross_validation = False
		self.nr_fold = 0
		self.quiet = False
Example #3
def crossValidateSVM():
    f1Inputs, f1Labels, _ = read_libsvm('data/data_semeion/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm('data/data_semeion/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm('data/data_semeion/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm('data/data_semeion/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm('data/data_semeion/folds/fold5')
    allFoldInputArrays = [
        f1Inputs.toarray(),
        f2Inputs.toarray(),
        f3Inputs.toarray(),
        f4Inputs.toarray(),
        f5Inputs.toarray()
    ]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]

    initLearningRates = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    regularizations = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]

    bestLearningRate = None
    bestRegularization = None
    bestAccuracy = 0

    everyAccuracy = []

    for rate in initLearningRates:
        for regularization in regularizations:
            allAccuracies = []
            for i in range(len(allFoldInputArrays)):
                allTrainData = []
                allTrainLabels = []
                for j in range(len(allFoldInputArrays)):
                    if j != i:
                        allTrainData.extend(allFoldInputArrays[j])
                        allTrainLabels.extend(allFoldLabelArrays[j])

                print("Hyperparameters: Learning rate: " + str(rate) +
                      " Regularization: " + str(regularization))

                tempsvm = svm(numFeatures, rate, regularization, 100)
                tempsvm.train(allTrainData, allTrainLabels)
                accuracy = tempsvm.evaluate(allFoldInputArrays[i],
                                            allFoldLabelArrays[i])
                allAccuracies.append(accuracy)
                everyAccuracy.append(accuracy)

            if statistics.mean(allAccuracies) > bestAccuracy:
                bestAccuracy = statistics.mean(allAccuracies)
                bestLearningRate = rate
                bestRegularization = regularization

    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best rate: " + str(bestLearningRate))
    print("Best reg: " + str(bestRegularization))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))
Example #4
def runSVM_CV(dataCV, es):
    # record the start time
    t_st = time.time()

    lrs = [10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    # initial learning rates
    Cs = [
        10**3,
        10**2,
        10**1,
        10**0,
        10**-1,
        10**-2,
    ]
    # initial tradeoffs (C values)
    hps = list(itertools.product(lrs, Cs))
    best_perf = pd.DataFrame(columns=['Ep', 'lr', 'C', 'acc', 'obj'])
    T = 10

    for f in dataCV:
        print('\n Fold -', f)
        dataVal = dataCV[f]['val'].to_numpy()
        data = dataCV[f]['trn']
        acc0 = 0
        # reset accuracy

        for lr, C in hps:  # for learning rates and tradeoff combinations

            # CV training
            w_best, _, lc, obj, losses = svm(data, lr, C, es, T)
            # CV validation
            X = dataVal[:, 1:]
            X = np.hstack((X, np.ones((X.shape[0], 1))))
            # add bias
            y = dataVal[:, 0]
            acc_Val = accuracy(X, y, w_best)  # accuracy(X,y,w):

            if acc_Val > acc0:
                best_perf.loc[f] = [len(lc), lr, C, acc_Val, obj[-1]]
                acc0 = acc_Val

    print('\n -- Best Performance over CV Folds -- ')
    print(best_perf)
    print('\nEarly stop:', es)
    t_en = time.time()
    t_run = np.round((t_en - t_st) / 60, 3)
    print('\nRuntime (m):', t_run)

    return best_perf, t_run
Example #5
def runSVM_CV(dataCV):
    # record the start time
    t_st = time.time()

    lrs = [10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    # initial learning rates
    Cs = [
        10**3,
        10**2,
        10**1,
        10**0,
        10**-1,
        10**-2,
    ]
    # initial tradeoffs (C values)
    hps = list(itertools.product(lrs, Cs))
    best_perf = pd.DataFrame(columns=['Ep', 'lr', 'C', 'acc', 'obj'])
    T = 50

    for f in dataCV:
        print('\n Fold -', f)
        data = dataCV[f]
        acc0 = 0
        # reset accuracy

        for lr, C in hps:  # for learning rates and tradeoff combinations

            tau = 0.01 * C
            # early stop threshold
            w_best, best_acc, lc, obj, up = svm(data, lr, C, tau, T)

            if best_acc > acc0:
                best_perf.loc[f] = [len(lc), lr, C, best_acc, obj[-1]]
                acc0 = best_acc

    print('\n -- Best Performance over CV Folds -- \n', best_perf)

    t_en = time.time()
    print('\nRuntime (m):', np.round((t_en - t_st) / 60, 3))

    return best_perf
Example #6
def run():
	# fit the model on 50 draws from N(0, 1) and 50 draws from N(8, 1)
	mod = svm(array([[gauss(0, 1)] for i in range(50)] + [[gauss(8, 1)] for i in range(50)]).reshape([100, 1]))
	
	print "Total Loss: %s" % sum( (mod.Y.reshape( [len(mod.X),]) - mod.cdf( mod.X.reshape( [len(mod.X),]) ) ) ** 2)
	
	fig = plt.figure()
	
	start = -5.
	end = 12.
	X = arange(start,end,.25)
	
	#a = fig.add_subplot(2,2,1)
	#n, bins, patches = a.hist(mod.data, 20, normed=1, facecolor='green', alpha=0.5, label='empirical distribution')
	#a.plot(X,mod.Pr(X), 'r--', label="computed distribution")
	#a.set_title("Computed vs empirical PDF")
	
	c = fig.add_subplot(2,2,2)
	c.plot(numpy.sort(mod.X,0), numpy.sort(mod.Y,0), 'green' )
	c.plot(X, mod.cdf(X), 'r--' )
	c.plot( mod.X, (mod.Y.reshape( [len(mod.X),]) - mod.cdf( mod.X.reshape( [len(mod.X),]) ) ) ** 2, '+' )
	c.set_title("Computed vs emprical CDF")
Example #7
from svm import *
from mlp import load_test

if __name__ == "__main__":
    data = load_data()
    train = data[0]
    valid = data[1]

    test = load_test("4-9")

    print "Training phase"
    alpha, b = svm(data[0], data[1], tau=2**-5, C=2**-4)
    print "Testing phase"
    validate((data[0], data[1]), alpha, b, (test[0] / 255, test[1]), 2**-5)
Example #8
File: test_svm.py Project: sduc/pcml
from svm import *
from mlp import load_test

if __name__ == "__main__":
    data = load_data()
    train = data[0]
    valid = data[1]

    test = load_test("4-9")
    
    print "Training phase"
    alpha,b = svm(data[0],data[1],tau=2**-5,C=2**-4)
    print "Testing phase"
    validate((data[0],data[1]),alpha,b,(test[0]/255,test[1]),2**-5)



Example #9
from logit import *
from read_data import *
from svm import *
from hoeffding import *
from random_forest import *

if __name__ == "__main__":
    '''
    Run line by line to avoid confusing the output
    '''
    # Read data
    (train_x, train_y, test_x, test_y) = read_data()

    # Run SVM algorithm
    CI_SVM = svm(train_x, train_y, test_x, test_y)

    # Logistic Regression algorithm
    CI_LR = logit(train_x, train_y, test_x, test_y)

    # Random Forest
    CI_RF = random_forest(train_x, train_y, test_x, test_y)

    print("\n\nFinal Results")
    print("==================================================")

    print("\nHoeffding's Confidence interval for SVM is:")
    print(CI_SVM)

    print("\nHoeffding's Confidence interval for LR is:")
    print(CI_LR)
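
For context, the Hoeffding-style interval reported above can be computed directly from an observed test accuracy. The helper below is a minimal illustrative sketch of that standard bound; it is not the hoeffding module imported above, and the numbers in the usage line are made up.

import math

def hoeffding_interval(observed_accuracy, n_test, delta=0.05):
    # Hoeffding's inequality for the mean of n_test variables bounded in [0, 1]:
    # with probability at least 1 - delta, the true accuracy lies within
    # eps = sqrt(ln(2 / delta) / (2 * n_test)) of the observed accuracy.
    eps = math.sqrt(math.log(2.0 / delta) / (2.0 * n_test))
    return max(0.0, observed_accuracy - eps), min(1.0, observed_accuracy + eps)

# e.g. 87% accuracy measured on 1000 test points, at 95% confidence
print(hoeffding_interval(0.87, 1000))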
Example #10
def runSVM_trn(dataTrn, lr, C, tau, T):

    # train on the supplied training data and return the weights and diagnostics
    w_best, best_acc, lc, obj, losses = svm(dataTrn, lr, C, tau, T)

    return w_best, best_acc, lc, obj, losses
Example #11
def main():
    # load the data
    datasets = []
    data = pd.read_csv('adult.csv')
    print(data.shape)
    data.count()[1]

    #	print(data.head())
    def cc(x):
        return sum(x == '?')


#	print(data.apply(cc))

    df = data[data.occupation != '?']
    #print(df.shape)

    df = df[df.workclass != '?']
    #print(df.shape)

    df = df[df['native.country'] != '?']
    #print(df.shape)
    #print(df.groupby(by='education')['education.num'].mean())

    df.loc[df['native.country'] != 'United-States',
           'native.country'] = 'non_usa'
    df.loc[df['income'] == '<=50K', 'income'] = -1
    df.loc[df['income'] == '>50K', 'income'] = 1

    features_categorical = [
        'workclass', 'education', 'marital.status', 'occupation',
        'relationship', 'race', 'sex', 'native.country'
    ]
    features_numerical = [
        'age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss',
        'hours.per.week'
    ]

    # convert the categorical features into one-hot encoding
    for feature in features_categorical:
        df1 = pd.get_dummies(df[feature], drop_first=False)
        df = df.drop([feature], axis=1)
        df = df.join(df1)
        print(df.shape)

    # normalize the numerical features by z-score normalization
    for feature in features_numerical:
        df[feature] = (df[feature] - df[feature].mean()) / df[feature].std()

    #df['capital.change'] = (df['capital.gain'] > 0) | (df['capital.loss'] >0)
    #df['capital.change'] = df['capital.change'].astype(int)

    print(df.columns)
    print(df.head())

    # first test on hours.per.week, education.num
    df1 = df.drop(['income'], axis=1)
    allX = df1.values
    allX = allX.astype(float)  # astype returns a copy, so assign it back
    ally = df[['income']].values  # as_matrix() was removed in newer pandas

    print(allX.shape, ally.shape)
    X = allX[0:2000]
    y = ally[0:2000]
    myC = 10

    num_ensamble = 10
    classifiers = []
    for i in range(num_ensamble):
        classifier = svm(C=myC, kernel=linear_kernel, gamma=0.05, coef=1)
        classifiers.append(classifier)

    for i in range(num_ensamble):
        X_train, X_val, y_train, y_val = subsample(X, y, 1.0)
        lagr_mult = classifiers[i].fit(X_train, y_train)

        y_pred = classifiers[i].predict(X_val)

        accuracy = get_accuracy(y_val, y_pred)
        print("Out of bag Validation accuracy is {}".format(accuracy))

    # while testing, predict with each svm
    # Take majority vote
    # measure accuracy
    X_test = allX[2001:4000]
    y_test = ally[2001:4000]
    predictions = []
    for i in range(num_ensamble):
        y_pred = classifiers[i].predict(X_test)
        predictions.append(y_pred)

    # do majority vote reduction
    predictions = np.array(predictions)
    pred_t = []
    for i in range(len(X_test)):
        myarray = predictions[:, i].reshape(-1)
        # print(myarray)
        u, indices = np.unique(myarray, return_inverse=True)
        pred_t.append(u[np.argmax(np.bincount(indices))])

    # calculate the accuracy
    accuracy = get_accuracy(pred_t, y_test)
    print("Testing Accuracy is ", accuracy)
Example #12
from sklearn import svm  # the snippet assumes scikit-learn's svm module
model = svm.SVC()  # there are various options associated with it; this is a simple one for classification. You can refer to the link for more detail.

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)
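
As a runnable complement to the template above, here is a minimal self-contained sketch of the same scikit-learn workflow. The synthetic data, the split, and the variable names below are illustrative assumptions, not part of the original snippet.

import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split

# synthetic two-class data, purely for illustration
rng = np.random.RandomState(0)
X_all = rng.randn(200, 2)
y_all = (X_all[:, 0] + X_all[:, 1] > 0).astype(int)
X, x_test, y, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=0)

model = svm.SVC()                     # default RBF-kernel classifier
model.fit(X, y)                       # train on the training split
print(model.score(X, y))              # training accuracy
predicted = model.predict(x_test)     # predict on the held-out split
print((predicted == y_test).mean())   # held-out accuracy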

R Code

library(e1071)
x <- cbind(x_train, y_train)

# Fitting model
fit <- svm(y_train ~ ., data = x)
summary(fit)

# Predict Output
predicted <- predict(fit, x_test)
Example #13
                      " Regularization: " + str(regularization))

                tempsvm = svm(numFeatures, rate, regularization, 100)
                tempsvm.train(allTrainData, allTrainLabels)
                accuracy = tempsvm.evaluate(allFoldInputArrays[i],
                                            allFoldLabelArrays[i])
                allAccuracies.append(accuracy)
                everyAccuracy.append(accuracy)

            if statistics.mean(allAccuracies) > bestAccuracy:
                bestAccuracy = statistics.mean(allAccuracies)
                bestLearningRate = rate
                bestRegularization = regularization

    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best rate: " + str(bestLearningRate))
    print("Best reg: " + str(bestRegularization))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))


crossValidateSVM()

## SVM test:
testSvm = svm(numFeatures, 0.001, 10, 100)
testSvm.train(trainingInputsArr, trainingLabels)
print("SVM training evaluation: ")
print(testSvm.evaluate(trainingInputsArr, trainingLabels))
print("SVM test evaluation: ")
print(testSvm.evaluate(testInputsArr, testLabels))
Example #14
testes = [
    Teste("./dataset/cars/car.data", "car", 6, ','),
    Teste("./dataset/mushroom/agaricus-lepiota.data", "mushroom", 0, ","),
    Teste("./dataset/nursery/nursery2.data", "nursery", 8, ',')
]

# Variables
numFolds = 10

if len(sys.argv) < 2:
    tstAtl = 2
else:
    tstAtl = int(sys.argv[1])

# SVM-based classifier
predSvm, labelsSvm = svm(testes[tstAtl])
printAnalysis(predSvm, labelsSvm, "SVM")

# Naive Bayes-based classifier
classifier = NaiveBayesClassifier(testes[tstAtl].separador,
                                  testes[tstAtl].labelPosi)

# clear the outputs
classifier.cleanOutput()

# class that generates the folds file and processes the data
# parameters = (numFolds, nomeArqSaida, arqEntrada)
dataMinipu = dataManip(numFolds, testes[tstAtl].nomeProb, testes[tstAtl].data,
                       testes[tstAtl].labelPosi, testes[tstAtl].separador)

# process the data
Example #15

discreteizeData(trainingInputsArr)
discreteizeData(testInputsArr)
discreteizeData(evalInputsArr)

# remap label 0 to -1 (SVM-style +/-1 labels)
for i in range(len(trainingLabels)):
    if trainingLabels[i] == 0:
        trainingLabels[i] = -1

for i in range(len(testLabels)):
    if testLabels[i] == 0:
        testLabels[i] = -1

## SVM test:
testSvm = svm(numFeatures, 1, 1000, 2000)
testSvm.train(trainingInputsArr, trainingLabels)
# print("SVM training evaluation: ")
# print(testSvm.evaluate(trainingInputsArr, trainingLabels))
# print("SVM test evaluation: ")
# print(testSvm.evaluate(testInputsArr, testLabels))
testSvm.evaluate(evalInputsArr, evalLabels)

## Naive Bayes Test:
# testBayes = naiveBayes(numFeatures, .5)
# testBayes.train(trainingInputsArr, trainingLabels)
# # print(testBayes.evaluate(trainingInputsArr, trainingLabels))
# # print(testBayes.evaluate(testInputsArr, testLabels))
# testBayes.evaluate(evalInputsArr, evalLabels)

## Random Forest test: