Example #1
def main():
    np.random.seed(0)

    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])

    C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

    # C = 0.061 F-1 = 0.525
    trainRegularizationStrengthForl2(data, train, validation, C)

    # C = 0.175 F-1 = 0.526170798898
    trainRegularizationStrengthForl1(data, train, validation, C)
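
The trainRegularizationStrength* helpers are not included in this snippet. A minimal sketch of what trainRegularizationStrengthForl2 might look like, assuming a helper module v with makeMatrix/validate as in Example #6 below (a guess, not the original implementation):

from sklearn.linear_model import LogisticRegression

def trainRegularizationStrengthForl2(data, train, validation, C):
    # Build feature matrices for the training and validation splits.
    X_train, y_train = v.makeMatrix(data, train)
    X_val, y_val = v.makeMatrix(data, validation)
    best_C, best_f1 = None, -1.0
    for c in C:
        clf = LogisticRegression(C=c,
                                 class_weight='balanced',
                                 max_iter=10000,
                                 solver='sag')
        # validate() is assumed to fit clf and return the F-1 measure.
        f1 = v.validate(data, X_train, y_train, X_val, y_val, clf)
        if f1 > best_f1:
            best_C, best_f1 = c, f1
    print("Best C = %s with F-1 = %s" % (best_C, best_f1))
    return best_C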
Example #2
def main():
    np.random.seed(0)

    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])

    # trainNeuralNetworks(data, train, validation)

    # alphas = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
    alphas = np.arange(0.0001, 0.0015, 0.0001)
    N = [75, 100]
    F_1 = np.zeros([len(alphas), len(N)])
    for i in range(len(alphas)):
        for j in range(len(N)):
            F_1[i, j] = trainNeuralNetworks(data,
                                            train,
                                            validation,
                                            N=N[j],
                                            alpha=alphas[i])
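
trainNeuralNetworks is not shown either. Given the N (hidden units) and alpha (L2 penalty) arguments, a plausible sketch built on scikit-learn's MLPClassifier, again assuming the v.makeMatrix/v.validate helpers (hypothetical):

from sklearn.neural_network import MLPClassifier

def trainNeuralNetworks(data, train, validation, N=100, alpha=0.0001):
    X_train, y_train = v.makeMatrix(data, train)
    X_val, y_val = v.makeMatrix(data, validation)
    # One hidden layer with N units and L2 penalty alpha.
    clf = MLPClassifier(hidden_layer_sizes=(N, ), alpha=alpha, max_iter=1000)
    f1 = v.validate(data, X_train, y_train, X_val, y_val, clf)
    print("F-1 measure for N = %s and alpha = %s is %s" % (N, alpha, f1))
    return f1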
Example #3
def main():
    np.random.seed(0)

    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])

    # C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

    # n_estimators = [5000, 10000, 50000, 100000, 500000]
    # n_estimators = np.arange(10, 200, 10)
    # n_estimators = np.repeat([100],100)

    # number_of_trees = 100, average F-1 on 100 forests = 0.377228139802
    # trainRandomForest(data, train, validation, n_estimators, max_features = None)

    n_estimators = [100, 200, 500, 1000]
    # number_of_boosting_stages = 100, average F-1 on 100 boostings = 0.377228139802
    trainGradientBoosting(data,
                          train,
                          validation,
                          n_estimators,
                          max_features='auto')
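
For reference, a hedged sketch of trainGradientBoosting, assuming it sweeps the n_estimators list with scikit-learn's GradientBoostingClassifier and reports validation F-1 via the same v helpers (not the original code):

from sklearn.ensemble import GradientBoostingClassifier

def trainGradientBoosting(data, train, validation, n_estimators, max_features=None):
    X_train, y_train = v.makeMatrix(data, train)
    X_val, y_val = v.makeMatrix(data, validation)
    for n in n_estimators:
        clf = GradientBoostingClassifier(n_estimators=n,
                                         max_features=max_features)
        f1 = v.validate(data, X_train, y_train, X_val, y_val, clf)
        print("F-1 measure for %s boosting stages is %s" % (n, f1))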
Example #4
def main():
    np.random.seed(0)

    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])

    # C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    C = [0.000001, 0.000005, 0.00001, 0.00005, 0.0001, 0.0005]
    # C = [10, 50, 100, 500, 1000]
    # C = np.arange(1, 10, 1)

    # C = 137, F-1 = 0.541310541311
    trainSVMWithGaussianKernel(data, train, validation, C)

    trainSVMWithLinearKernel(data, train, validation, C)

    # SVM with Linear Kernel

    # l1, squared hinge, C = 50, F-1 = 0.525447042641
    # l2, hinge, C = 0.001 , F-1 = 0.512968299712
    # l2, squared hinge, C = 1, F-1 = 0.524725274725

    trainSVMWithLinearKernel2(data, train, validation, C)
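
The SVM helpers are also external. A minimal sketch of trainSVMWithGaussianKernel, assuming it sweeps C with an RBF-kernel SVC and reports validation F-1 (the linear-kernel variants would swap in LinearSVC, as Example #6 below does):

from sklearn.svm import SVC

def trainSVMWithGaussianKernel(data, train, validation, C):
    X_train, y_train = v.makeMatrix(data, train)
    X_val, y_val = v.makeMatrix(data, validation)
    for c in C:
        clf = SVC(C=c, kernel='rbf', class_weight='balanced')
        f1 = v.validate(data, X_train, y_train, X_val, y_val, clf)
        print("F-1 measure for RBF kernel with C = %s is %s" % (c, f1))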
Example #5
if fName == 'abalone.data':
    abalonePath = os.path.join(path, fName)
    abaloneNames = [
        'sex', 'length', 'diameter', 'height', 'wholeHt', 'shuckWt',
        'visceraWt', 'shellWt', 'rings'
    ]
    data, features, classNum = prepData(abalonePath, abaloneNames, slice(-1),
                                        'rings')
    classes = classNum.astype(str)
    classes[classNum <= 8] = '1-8'
    classes[classNum >= 11] = '11+'

else:
    sys.exit("No such data set.")

(xvData, xvLabel), xvFolds, pruningSet = splitData(data, classes)
fullErr, prnErr = crossValidate(xvData, xvLabel, pruningSet, xvFolds,
                                printTree)

print("Full Tree - Mean Error: %f" % fullErr.mean())
print("Full Tree - St Dev Error: %f" % fullErr.std())

print("Pruned Tree - Mean Error: %f" % prnErr.mean())
print("Pruned Tree - St Dev Error: %f" % prnErr.std())

tr = TrainDTree(xvData, xvLabel)  # train DTree using full cross-val sample
tr.combineChildNodes()  # combine subtrees with homogeneous classes
if printTree:  # print full tree
    print("\n===full tree===")
    print(tr)
PruneDTree(tr, pruningSet[0], pruningSet[1])  # prune with pruning set
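
crossValidate is not part of this snippet. One plausible shape for it, reusing TrainDTree/PruneDTree from the code above and assuming that xvFolds is a list of held-out index arrays and that the tree exposes a classify(row) method (both assumptions):

import numpy as np

def crossValidate(xvData, xvLabel, pruningSet, xvFolds, printTree=False):
    def errorRate(tree, X, y):
        # hypothetical helper: fraction of misclassified rows
        preds = np.array([tree.classify(row) for row in X])
        return np.mean(preds != y)

    fullErr = np.zeros(len(xvFolds))
    prnErr = np.zeros(len(xvFolds))
    for k, testIdx in enumerate(xvFolds):
        # train on everything except the held-out fold
        trainIdx = np.setdiff1d(np.arange(len(xvLabel)), testIdx)
        tr = TrainDTree(xvData[trainIdx], xvLabel[trainIdx])
        tr.combineChildNodes()
        fullErr[k] = errorRate(tr, xvData[testIdx], xvLabel[testIdx])
        PruneDTree(tr, pruningSet[0], pruningSet[1])
        prnErr[k] = errorRate(tr, xvData[testIdx], xvLabel[testIdx])
    return fullErr, prnErr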
Example #6
def main():
    np.random.seed(0)

    data = f.readData()
    train, validation, test = f.splitData(data.shape[0])

    X_train, y_train = v.makeMatrix(data, train)
    X_test, y_test = v.makeMatrix(data, test)

    print("Logistic Regression")
    clf = LogisticRegression(C=0.061,
                             class_weight='balanced',
                             max_iter=10000,
                             solver='sag',
                             n_jobs=-1)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for Logistic Regression with l2 and C = %s is %s" %
          (0.061, f1))

    clf = LogisticRegression(penalty='l1',
                             C=0.175,
                             class_weight='balanced',
                             max_iter=5000,
                             solver='liblinear',  # the l1 penalty needs liblinear or saga
                             n_jobs=-1)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for Logistic Regression with l1 and C = %s is %s" %
          (0.175, f1))

    print("SVM")
    clf = SVC(C=137, class_weight='balanced')
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print("F-1 measure for SVM with RBF and C = %s is %s" % (137, f1))

    # l1, squared hinge, C = 50, F-1 = 0.525447042641
    # l2, hinge, C = 0.001 , F-1 = 0.512968299712
    # l2, squared hinge, C = 1, F-1 = 0.524725274725
    C = 50
    loss = "squared_hinge"
    penalty = 'l1'
    clf = LinearSVC(C=C,
                    loss=loss,
                    penalty=penalty,
                    class_weight='balanced',
                    dual=False)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print(
        "F-1 measure for SVM with Linear Kernal, %s loss, %s penalty, and C = %s is %s"
        % (loss, penalty, C, f1))

    C = 0.001
    loss = "hinge"
    penalty = 'l2'
    clf = LinearSVC(C=C,
                    loss=loss,
                    penalty=penalty,
                    class_weight='balanced',
                    dual=True)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print(
        "F-1 measure for SVM with Linear Kernal, %s loss, %s penalty, and C = %s is %s"
        % (loss, penalty, C, f1))

    C = 1
    loss = "squared_hinge"
    penalty = 'l2'
    clf = LinearSVC(C=C,
                    loss=loss,
                    penalty=penalty,
                    class_weight='balanced',
                    dual=False)
    f1 = v.validate(data, X_train, y_train, X_test, y_test, clf)
    print(
        "F-1 measure for SVM with Linear Kernal, %s loss, %s penalty, and C = %s is %s"
        % (loss, penalty, C, f1))
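
The helper module v (makeMatrix/validate) is not shown. validate is presumably a thin wrapper that fits the classifier and reports the F-1 measure on the held-out split; a minimal sketch (the data argument is kept only to match the call sites above):

from sklearn.metrics import f1_score

def validate(data, X_train, y_train, X_test, y_test, clf):
    # Fit on the training split and score F-1 on the held-out split.
    clf.fit(X_train, y_train)
    return f1_score(y_test, clf.predict(X_test))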
Example #7
TODO: Can a feature that is always 0 get high weights in the linear fit?  -> Yes! Very easily, in fact.
		Does ridge regression change that? -> Yes! w[i] is then 0.
		To check this, either simply set a feature to 0 and train, or take a trained model, set feature X to ~0 and set the weight
		for X very high - this should hurt performance very little if feature X is approximately 0.
"""

### data
data = F.loadPreparedData()
### normalize each feature to a maximum of 1 (makes the weights comparable - not necessary with shap, but nice to have)
data = data / data.max(axis=0)
## append sqrt(fare) to improve the distribution
data["sqrt(fare)"] = np.sqrt(data["fare"])
## append a near-zero data column
data["testZero"] = rnd.random(len(data)) * 0.00001
## test train split
x_train, y_train, x_test, y_test = F.splitData(data)
### correlation
corr = data.corr()["survived"]
F.prettyPrint("Correlation",
              corr.sort_values(key=lambda x: abs(x), ascending=False))

DO_LIN_FIT = 0
DO_NN_FIT = 1
DO_NNLIN_FIT = 0
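
# One plausible shape of F.linRegression (a hedged guess at the helper, not the
# original): fit an ordinary least-squares model on the prepared features and
# return the fitted model together with its per-feature weights.
def _linRegressionSketch(x_train, y_train, x_test, y_test):
    from sklearn.linear_model import LinearRegression
    model = LinearRegression().fit(x_train, y_train)
    # assumes the splits are DataFrames, so weights can be keyed by column name
    w = dict(zip(x_train.columns, model.coef_))
    print("test R^2: %f" % model.score(x_test, y_test))
    return model, w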

if DO_LIN_FIT:
    print("-----------------------------------------")
    print(" ######## LIN MODEL ########")
    print("-----------------------------------------")
    ### Linear Model
    linModel, w = F.linRegression(x_train, y_train, x_test, y_test)