def CVF(hp, decaying=False, averaged=False):
    """5-fold cross-validation for the perceptron.

    Trains on each fold in turn (learning rate ``hp``, 10 epochs, with an
    optional decaying-rate or averaged variant), evaluates on the other
    four folds, and returns the mean of the per-fold average accuracies.

    Fixes vs. original: removed unreachable ``pass`` after the return and
    collapsed the three near-identical train() calls into one.
    """
    folds = []
    for i in range(5):
        file = "./CVfolds/fold" + str(i + 1)
        cv_x_train, cv_y_train, cv_num_features = libsvm.read_libsvm(file)
        # Forward exactly the flag the caller set (decaying wins, matching
        # the original if/elif ordering) instead of triplicating the call.
        variant = {}
        if decaying:
            variant["decaying"] = True
        elif averaged:
            variant["averaged"] = True
        cv_w, cv_b, cv_hist, accs = train(cv_x_train.toarray(),
                                          cv_y_train,
                                          epochs=10,
                                          lr=hp,
                                          **variant)
        # Average accuracy of this fold's model over the four other folds.
        temp = 0
        for j in range(5):
            if i != j:
                f = "./CVfolds/fold" + str(j + 1)
                x, y, _ = libsvm.read_libsvm(f, cv_num_features)
                temp += accuracy(x.toarray(), y, cv_w, cv_b)
        folds.append(temp / 4)
    return sum(folds) / len(folds)
def setup_data(data_set):
    """Load train/test splits for the requested data set.

    ``data_set`` "s" loads the Kaggle splits, "m" loads madelon; any other
    value returns None (matching the original implicit fall-through).

    Returns (X_train, y_train, num_features, X_test, y_test).
    """
    paths = {
        "s": ("../Kaggle/data/data-splits/data.train",
              "../Kaggle/data/data-splits/data.test"),
        "m": ("data_madelon/madelon_data_train",
              "data_madelon/madelon_data_test"),
    }
    if data_set not in paths:
        return None
    train_path, test_path = paths[data_set]
    xtr, ytr, nf = libsvm.read_libsvm(train_path)
    # Reuse the training feature count so both matrices share a column space.
    xt, yt, _ = libsvm.read_libsvm(test_path, nf)
    return xtr, ytr, nf, xt, yt
def setup_data(data_set):
    """Load train/test splits for the semeion ("s") or madelon ("m") data.

    Returns (X_train, y_train, num_features, X_test, y_test); any other
    ``data_set`` value yields None.
    """
    paths = {
        "s": ("data_semeion/hand_data_train",
              "data_semeion/hand_data_test"),
        "m": ("data_madelon/madelon_data_train",
              "data_madelon/madelon_data_test"),
    }
    if data_set not in paths:
        return None
    train_path, test_path = paths[data_set]
    xtr, ytr, nf = libsvm.read_libsvm(train_path)
    # The test reader takes the training feature count so columns line up.
    xt, yt, _ = libsvm.read_libsvm(test_path, nf)
    return xtr, ytr, nf, xt, yt
예제 #4
0
def crossValidateSVM():
    """5-fold cross-validation grid search over SVM learning rate and
    regularization on the semeion folds.

    Prints the best (rate, regularization) pair, its mean held-out
    accuracy, and the average accuracy over every run.

    NOTE(review): relies on the module-level ``numFeatures`` global when
    constructing each svm -- confirm it matches the fold files.
    Fix vs. original: the fold-mean was computed twice per grid point.
    """
    f1Inputs, f1Labels, _ = read_libsvm('data/data_semeion/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm('data/data_semeion/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm('data/data_semeion/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm('data/data_semeion/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm('data/data_semeion/folds/fold5')
    allFoldInputArrays = [
        f1Inputs.toarray(),
        f2Inputs.toarray(),
        f3Inputs.toarray(),
        f4Inputs.toarray(),
        f5Inputs.toarray()
    ]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]

    # Hyper-parameter grids: powers of ten from 10 down to 1e-4.
    initLearningRates = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    regularizations = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]

    bestLearningRate = None
    bestRegularization = None
    bestAccuracy = 0

    everyAccuracy = []

    for rate in initLearningRates:
        for regularization in regularizations:
            allAccuracies = []
            for i in range(len(allFoldInputArrays)):
                # Train on the four folds other than i, validate on fold i.
                allTrainData = []
                allTrainLabels = []
                for j in range(len(allFoldInputArrays)):
                    if j != i:
                        allTrainData.extend(allFoldInputArrays[j])
                        allTrainLabels.extend(allFoldLabelArrays[j])

                print("Hyperparameters: Learning rate: " + str(rate) +
                      " Regularization: " + str(regularization))

                tempsvm = svm(numFeatures, rate, regularization, 100)
                tempsvm.train(allTrainData, allTrainLabels)
                accuracy = tempsvm.evaluate(allFoldInputArrays[i],
                                            allFoldLabelArrays[i])
                allAccuracies.append(accuracy)
                everyAccuracy.append(accuracy)

            # Compute the fold-mean once instead of twice.
            meanAccuracy = statistics.mean(allAccuracies)
            if meanAccuracy > bestAccuracy:
                bestAccuracy = meanAccuracy
                bestLearningRate = rate
                bestRegularization = regularization

    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best rate: " + str(bestLearningRate))
    print("Best reg: " + str(bestRegularization))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))
def CVF(hp):
    """5-fold cross-validation: train on each fold with learning rate ``hp``
    (10 epochs), score on the other four folds, and return the mean of the
    per-fold average accuracies.

    Fix vs. original: removed the unreachable ``pass`` after the return.
    """
    folds = []
    for i in range(5):
        file = "./data/cvf/fold" + str(i + 1)
        cv_x_train, cv_y_train, cv_num_features = libsvm.read_libsvm(file)
        cv_w, cv_b, cv_hist, accs = train(cv_x_train.toarray(), cv_y_train, epochs=10, lr=hp)
        # Accumulate accuracy over the four held-out folds.
        temp = 0
        for j in range(5):
            if i != j:
                f = "./data/cvf/fold" + str(j + 1)
                x, y, _ = libsvm.read_libsvm(f, cv_num_features)
                temp += accuracy(x.toarray(), y, cv_w, cv_b)
        folds.append(temp / 4)
    return sum(folds) / len(folds)
def cvf(data_set, lr):
    """5-fold cross-validation over the fold files for a data set.

    Trains on each fold with learning rate ``lr``, averages accuracy over
    the other four folds, and returns the mean across all five folds.

    BUG FIX: the original iterated ``range(1, 5)`` (skipping fold5) and
    opened ``fold(j + 1)`` while the skip test compared ``i != j`` -- so a
    model was sometimes evaluated on its own training fold and each average
    divided only 3 accuracies by 4.  Now mirrors the sibling CVF() logic.
    """
    acc = []
    if data_set == "s":
        path = "data_semeion/folds"
    else:
        path = "data_madelon/folds"

    for i in range(1, 6):
        file = path + "/fold" + str(i)
        cvxtrain, cvytrain, cvnum = libsvm.read_libsvm(file)
        w, a = train(cvxtrain.toarray(), cvytrain, lr)
        rate = 0
        for j in range(1, 6):
            if i != j:
                f = path + "/fold" + str(j)
                x, y, _ = libsvm.read_libsvm(f, cvnum)
                x = add_bias(x.toarray())
                rate += accuracy(x, y, w)
        acc.append(rate / 4)
    return sum(acc) / len(acc)
예제 #7
0
def train_test():
    """Train simple and averaged perceptrons on the Kaggle training split
    and print training-set accuracy for several learning rates.

    Fixes vs. original: removed commented-out dead code and the unused
    ``decay_perceptron``/``pocket_perceptron``/``margin_perceptron``
    instances and ``margins`` list.
    """
    X_train, y_train, num_features = libsvm.read_libsvm(
        "C:\\Users\\Abhi\\Documents\\MyFiles\\AB\\GradSchool\\Fall19\\ML\\Project"
        "\\data\\data\\data-splits\\data.train")
    # log1p transform tames heavy-tailed feature values before training.
    X_train = X_train.log1p()

    simple_perceptron = Perceptron(num_features)
    average_perceptron = Perceptron(num_features)

    learning_rates = [0.01, 0.1, 1]

    print("Accuracies on training set for 20 epochs")
    print("--" * 50)

    print("Simple Perceptron")
    print("--" * 50)
    for r in learning_rates:
        w, b = simple_perceptron.simple_perceptron_train(X_train, y_train,
                                                         epochs=20, lr=r)
        print("Simple perceptron accuracy on training set for lr =", r, ":",
              simple_perceptron.accuracy(X_train, y_train, w, b) * 100)
    print()
    print("Averaged Perceptron")
    print("--" * 50)
    for r in learning_rates:
        w, b = average_perceptron.averaged_perceptron_train(X_train, y_train, epochs=20, lr=r)
        print("Average perceptron accuracy on training set for lr =", r, ":",
              average_perceptron.accuracy(X_train, y_train, w, b) * 100)
예제 #8
0
        id_list.append(line.strip())

    return id_list


def discretize(data):
    """Binarize each row of ``data`` in place around that row's own mean.

    Values above the row mean become 1, the rest 0.  ``data`` is assumed to
    support 2-D row indexing with 2-D rows (e.g. a scipy sparse matrix, as
    used by the callers in this file) -- TODO confirm for ndarray inputs,
    where ``data[i]`` is 1-D and Binarizer.transform expects 2-D.

    Fix vs. original: ``threshold`` is keyword-only in current
    scikit-learn; passing it positionally raises a TypeError.
    """
    for i in range(0, data.shape[0]):
        threshold = data[i].mean()
        # fit() is a no-op for this stateless transformer; kept for API symmetry.
        binarizer = preprocessing.Binarizer(threshold=threshold).fit(data)
        data[i] = binarizer.transform(data[i])


if __name__ == "__main__":

    # Load the three splits.  NOTE(review): each call re-derives
    # num_features from its own file instead of passing the training count
    # as read_libsvm's second argument (as other snippets in this file do);
    # if the splits disagree on feature count the matrices will not line
    # up -- verify.
    X_train, y_train, num_features = libsvm.read_libsvm(
        "../data/data/data-splits/data.train")

    X_test, y_test, num_features = libsvm.read_libsvm(
        "../data/data/data-splits/data.test")

    X_anon, y_anon, num_features = libsvm.read_libsvm(
        "../data/data/data-splits/data.eval.anon")

    # log1p compresses large feature values before binarization.
    X_train = X_train.log1p()
    X_test = X_test.log1p()
    X_anon = X_anon.log1p()

    # Binarize every split in place around each row's mean.
    print("Discretizing...")
    discretize(X_train)
    discretize(X_test)
    discretize(X_anon)
예제 #9
0
def read_data_to_array(path):
    """Load a libsvm-format file and return (dense feature array, labels)."""
    features, labels, _ = read_libsvm(path)
    return features.toarray(), labels
예제 #10
0
# Names of the pre-combined 4-fold training files and their held-out folds.
combineFoldsNames = [
    "fold1234", "fold1235", "fold1245", "fold1345", "fold2345"
]
singleFoldNames = ["fold5", "fold4", "fold3", "fold2", "fold1"]

# Grid search over (C index, learning-rate index, forest size): per fold,
# build `size` ID3 trees (depth limit 1, presumably -- see id3Depth's last
# argument; TODO confirm), each on 50 randomly sampled feature indices.
# NOTE(review): CINDEX/LRINDEX are never read in this visible span and
# `accuracies` is reset but never filled -- the evaluation code presumably
# follows below this excerpt; verify.
for CINDEX in range(6):
    for LRINDEX in range(6):
        for i in range(3):
            size = SizeOfTheForest[i]  # SizeOfTheForest defined elsewhere
            accuracies = []
            for j in range(5):
                combineFoldName = combineFoldsNames[j]
                singleFoldName = singleFoldNames[j]
                _indexCollection = []
                _root = []
                X_train, y_train, num_features = read_libsvm(combineFoldName)
                x = X_train.todense()
                for y in range(size):
                    # Sample 50 feature indices (with replacement) in [0, 359].
                    _index = []
                    for z in range(50):
                        _index.append(random.randint(0, 359))
                    # Write the sampled-feature view to 'fileFormated', then
                    # reload it as strings for the Data/id3 pipeline.
                    formatFile(x, y_train, _index, 6075)
                    trainData = np.loadtxt('fileFormated',
                                           delimiter=',',
                                           dtype=str)
                    trainData_obj = Data(data=trainData)
                    attributesSet = trainData_obj.attributes
                    root = id3Depth(attributesSet, trainData_obj, 1)
                    _root.append(root)
                    _indexCollection.append(_index)
예제 #11
0
printEpochs = False
BestC = 10

##########################################################################################################

# 5-fold cross-validation over learning rates for the simple perceptron:
# train on each combined 4-fold file, test on the matching held-out fold.
print("Cross Validation")
print()
C = learningRates[0]
for i in range(6):
    learningRate = learningRates[i]
    accuracies = []

    for j in range(5):
        combineFoldName = combineFoldsNames[j]
        singleFoldName = singleFoldNames[j]
        X_train, y_train, num_features = read_libsvm(combineFoldName)
        X_test, y_test, _ = read_libsvm(singleFoldName, num_features)
        w, b = SimplePerceptron(X_train.todense(), y_train, 10, learningRate, C)
        accuracies.append(accuracy(X_test.todense(), y_test, w, b))

    # BUG FIX: the original shadowed the builtin sum() with a manual
    # accumulator variable; use the builtin directly.
    average = sum(accuracies) / 5
    print("Learning Rate:",learningRate,"Average: ",average,"C:",C)

    # Track the best-performing learning rate seen so far.
    if average > bestAccuracy:
        bestLearningRate = learningRate
        BestC = C
        bestAccuracy = average
def setup():
    """Load the train/test splits and return them with the feature count."""
    train_X, train_y, n_features = libsvm.read_libsvm('data_train')
    # Pass the training feature count so the test matrix has matching columns.
    test_X, test_y, _ = libsvm.read_libsvm('data_test', n_features)
    return train_X, train_y, n_features, test_X, test_y
예제 #13
0

##########################################################################################################

# Cross-validation bookkeeping and fold-file names.
bestLearningRate = 0
bestAccuracy = 0
learningRates = [1, 0.1, 0.01]
combineFoldsNames = [
    "fold1234", "fold1235", "fold1245", "fold1345", "fold2345"
]
singleFoldNames = ["fold5", "fold4", "fold3", "fold2", "fold1"]
DecayingTheLearningRateHyperParameter = 0
DecayingTheLearningRateEpoch = 0
printEpochs = False

# Full train/test split, read once up front; the test reader reuses the
# training feature count so both matrices share a column space.
X_train, y_train, num_features = read_libsvm('data_train')
X_test, y_test, _ = read_libsvm('data_test', num_features)

##########################################################################################################

print("Cross Validation")
print()

##########################################################################################################

# NOTE(review): bestLearningRate/bestAccuracy are re-initialized here,
# discarding the assignments above -- looks like pasted-together scripts.
bestLearningRate = 0
bestAccuracy = 0

# Loop body continues below this excerpt.
for i in range(3):
    learningRate = learningRates[i]
    accuracies = []
예제 #14
0
def get_id_list():
    """Read eval.id and return its lines with surrounding whitespace stripped."""
    with open(r"../data/data/data-splits/eval.id") as handle:
        return [row.strip() for row in handle.readlines()]


enc = KBinsDiscretizer(n_bins=4, encode="onehot", strategy="uniform")

# Fit the discretizer on the training split only, so every split is binned
# with the same edges.
X_train, y_train, num_features = libsvm.read_libsvm(
    "../data/data/data-splits/data.train")
X_train_binned = enc.fit_transform(X_train.toarray())

# BUG FIX: the original called fit_transform on the test and eval splits
# too, re-fitting the bin edges per split -- which makes the encoded
# columns inconsistent across splits and leaks test statistics.  Use
# transform() with the train-fit edges instead.
X_test, y_test, num_features = libsvm.read_libsvm(
    "../data/data/data-splits/data.test")
X_test_binned = enc.transform(X_test.toarray())

X_test_eval, y_test_eval, num_features = libsvm.read_libsvm(
    "../data/data/data-splits/data.eval.anon")

X_eval_binned = enc.transform(X_test_eval.toarray())
id_list = get_id_list()

print("*" * 50 + "DTREE" + "*" * 50)
depths = [2]
print("Accuracies for non binned data")
예제 #15
0
from libsvm import read_libsvm
from svm import *
import statistics

## Setup Data:
# Load the semeion handwriting train/test splits; the test reader reuses
# the training feature count so both matrices share a column space.
trainingInputs, trainingLabels, numFeatures = read_libsvm(
    'data/data_semeion/hand_data_train')
testInputs, testLabels, _ = read_libsvm('data/data_semeion/hand_data_test',
                                        numFeatures)
# Dense copies for code paths that cannot consume sparse matrices.
trainingInputsArr = trainingInputs.toarray()
testInputsArr = testInputs.toarray()


def crossValidateSVM():
    """Set up 5-fold cross-validation data and hyper-parameter grids for an
    SVM search on the semeion folds.

    NOTE(review): only the data loading and the grids are visible in this
    excerpt; the search loop presumably continues below -- verify.
    """
    # Load each pre-split fold file independently.
    f1Inputs, f1Labels, _ = read_libsvm('data/data_semeion/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm('data/data_semeion/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm('data/data_semeion/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm('data/data_semeion/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm('data/data_semeion/folds/fold5')
    # Dense per-fold feature arrays and the matching label arrays.
    allFoldInputArrays = [
        f1Inputs.toarray(),
        f2Inputs.toarray(),
        f3Inputs.toarray(),
        f4Inputs.toarray(),
        f5Inputs.toarray()
    ]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]

    # Hyper-parameter grids: powers of ten from 10 down to 1e-4.
    initLearningRates = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    regularizations = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
예제 #16
0
from libsvm import read_libsvm
from svm import *
import statistics

## Setup Data:
# Load the madelon train/test splits; the test reader reuses the training
# feature count so both matrices share a column space.
trainingInputs, trainingLabels, numFeatures = read_libsvm('data/data_madelon/madelon_data_train')
testInputs, testLabels, _ = read_libsvm('data/data_madelon/madelon_data_test', numFeatures)
# Dense copies for code paths that cannot consume sparse matrices.
trainingInputsArr = trainingInputs.toarray()
testInputsArr = testInputs.toarray()


def crossValidateSVM():
    """Set up 5-fold cross-validation data and hyper-parameter grids for an
    SVM search on the madelon folds.

    NOTE(review): this excerpt ends right after ``counter = 1``; the search
    loop presumably continues below -- verify.
    """
    # Load each pre-split fold file independently.
    f1Inputs, f1Labels, _ = read_libsvm('data/data_madelon/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm('data/data_madelon/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm('data/data_madelon/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm('data/data_madelon/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm('data/data_madelon/folds/fold5')
    # Dense per-fold feature arrays and the matching label arrays.
    allFoldInputArrays = [f1Inputs.toarray(), f2Inputs.toarray(),
                          f3Inputs.toarray(), f4Inputs.toarray(), f5Inputs.toarray()]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]

    # Hyper-parameter grids: powers of ten from 10 down to 1e-4.
    initLearningRates = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    regularizations = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]

    # Best-so-far trackers for the grid search.
    bestLearningRate = None
    bestRegularization = None
    bestAccuracy = 0

    counter = 1
        return 10**-4


if __name__ == '__main__':
    # Load the "s" data set (train/test splits plus feature count).
    xtrain_s, ytrain_s, numfeat_s, xtest_s, ytest_s = setup_data("s")

    # Unlabeled evaluation examples for the Kaggle submission.
    xt, yt, _ = libsvm.read_libsvm("../Kaggle/data/data-splits/data.eval.anon")

    clf = AdaBoostClassifier(learning_rate=1, n_estimators=5000)
    clf.fit(xtrain_s, ytrain_s)
    score = clf.score(xtrain_s, ytrain_s)
    print(score)
    # BUG FIX: the original re-fit the classifier on the test split before
    # scoring, which leaks test labels into the model and turns test_score
    # into a training accuracy (and the eval predictions below were then
    # made by the test-fit model).  Score and predict with the train-fit
    # model instead.
    test_score = clf.score(xtest_s, ytest_s)
    print(test_score)
    labels = clf.predict(xt)

    # example_id values come from eval.id, one integer id per line.
    ids = np.fromfile('../Kaggle/data/data-splits/eval.id', dtype=int, sep="\n")
    with open('adaboost.csv', mode='w+', newline='') as csv_file:
        fieldnames = ['example_id', 'label']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
예제 #18
0
# Smoothing-parameter grid search over the combined 4-fold training files.
LambdaTerms = [2, 1.5, 1.0, 0.5]
combineFoldsNames = [
    "fold1234", "fold1235", "fold1245", "fold1345", "fold2345"
]
singleFoldNames = ["fold5", "fold4", "fold3", "fold2", "fold1"]
bestaccuracy = 0
bestLambda = 2

for i in range(4):
    lambdaTerm = LambdaTerms[i]
    accuracies = []
    for j in range(5):
        combineFoldName = combineFoldsNames[j]
        singleFoldName = singleFoldNames[j]

        X_train, y_train, num_features = read_libsvm(combineFoldName)
        # NOTE(review): X_train.todense() is recomputed on every access,
        # including once per cell in the loops below -- extremely slow;
        # densify once and reuse.
        totalCol = len(X_train.todense()[1].flat)
        totalRows = len(X_train.todense())
        # Per-class counting for what appears to be a Naive-Bayes-style
        # estimate of P(feature=1 | label) -- TODO confirm downstream.
        totalOnes = totalPositive(y_train)
        totalOnesProb = totalOnes / len(y_train)
        positiveOnes = []
        negativeOnes = []
        # NOTE(review): these inner loops reuse `i` and `j`, shadowing the
        # outer lambda/fold loop variables.  Harmless in Python (each outer
        # `for` reassigns its variable on the next iteration) but highly
        # confusing -- consider renaming.
        for i in range(totalCol):
            OnesPos = 0
            OnesNeg = 0
            for j in range(totalRows):
                x = X_train.todense()[j].flat[i]
                if x == 1:
                    if y_train[j] == 1:
                        OnesPos = OnesPos + 1
                    else:
def setup():
    """Load the data-splits train/test files and return them with the
    feature count."""
    train_X, train_y, n_feat = libsvm.read_libsvm('./data/data-splits/data.train')
    # Reuse the training feature count so the test matrix columns line up.
    test_X, test_y, _ = libsvm.read_libsvm('./data/data-splits/data.test', n_feat)
    return train_X, train_y, n_feat, test_X, test_y
예제 #20
0
import libsvm
import numpy as np
import random
from sklearn import preprocessing
from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler

X_train, y_train, num_features = libsvm.read_libsvm(
    "C:\\Users\\Abhi\\Documents\\MyFiles\\AB\\GradSchool\\Fall19\\ML\\Project"
    "\\data\\data\\data-splits\\data.train")

# Earlier preprocessing experiments kept for reference (log transform,
# L2 normalization); only the MaxAbs scaling below is active.
# log_x = X_train.log1p()
# print(type(log_x))

#print(X_train[1])
#print(log_x[1])

# normalizing the data

# normal_x = preprocessing.normalize(X_train)
# print(normal_x[2])

#min max scaling

# Scale each feature into [-1, 1] by its maximum absolute value;
# MaxAbsScaler preserves sparsity (no centering).
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_minmax = scaler.transform(X_train)

# print(X_train[0])
# print(binarized_X[0])