Example #1
def compute_SVR(train_x, train_y, test_x):

        # make MAE scoring
        MAE = make_scorer(compute_error, greater_is_better = False)

        ######### SVR - Polynomial/rbf Kernel #########
        # make pipeline
        std_SVR = make_pipeline(StandardScaler(), SVR())
        params = {'svr__kernel': ['poly', 'rbf'], 'svr__degree': [1, 2]}
        gs = GridSearchCV(estimator = std_SVR, param_grid = params, scoring = MAE, n_jobs=-1, cv = 5, return_train_score = True)


        # fit grid search
        gs.fit(train_x, train_y)
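        # cv_results_ stores negated MAE because the scorer uses greater_is_better=False,
        # so the sign is flipped back for display below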

        print('SVR train score', -gs.cv_results_['mean_train_score'])
        print('SVR test score', -gs.cv_results_['mean_test_score'])
        print('Best Parameter', gs.best_params_)
        print('Best score', -gs.best_score_)
        print('Parameters', gs.cv_results_['params'])
        
        # Train the best Model
        best_SVR = make_pipeline(StandardScaler(), SVR(kernel='poly', degree=1))
        best_SVR.fit(train_x, train_y)

        # Make Prediction
        test_y = best_SVR.predict(test_x)
        # Create test output values
        predicted_y = test_y * -1
        # Output file location
        file_name = '../Predictions/SVR_best.csv'

        # Writing output in Kaggle format
        print('Writing output to ', file_name)
        kaggle.kaggleize(predicted_y, file_name)
Example #2
def trainer(model,feature_selection1,train_data,test_data, data_set,split):
    #remove features with low variance using Var(X) = p(1 - p); the threshold .8*(1-.8) drops features where the same value appears in more than 80% of the samples
    if feature_selection1:    
        X_all=np.concatenate((train_data[:,1:],test_data[:,1:]),axis=0)
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        X_red=sel.fit_transform(X_all)
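        # X_red stacks the reduced train rows on top of the reduced test rows; the half-split
        # below assumes train and test have the same number of rows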
        X_learn=X_red[:(len(X_red)//2)]
        X_test=X_red[(len(X_red)//2):]
        Y_learn=train_data[:,0]
        X_learnS, Y_learnS=shuffle(X_learn,Y_learn)
    else:
        X_learnS, Y_learnS=shuffle(train_data[:,1:],train_data[:,0])
        X_test=test_data[:,1:]
    #best_model: trained model using optimized hyperparameters; errorF: Mean training error; c1: optimized n_estimators;     
    #c2: optimized max_features; c3: optimized min_samples_leaf; e1: array of training errors for n_estimators; e2: array of training errors for max_features
    #e3: array of training errors for min_samples_leaf
    if model=='RF':
        best_model, errorF, c1, c2, c3,e1,e2,e3=cv_cv(X_learnS, Y_learnS, model, split)
        print(model, "for", data_set, "- Mean Training Error, optimized n_estimators, max_features, min_samples_leaf: ", errorF, c1, c2, c3)
        plot_line(range(1,20),e1,'n_estimators')
        plot_line(np.arange(0.1,1.1,0.1),e2,'max_features')
        plot_line(range(1,10),e3,'min_samples_leaf')
    elif model=='DT':
        best_model, errorF, c1, c2,e1,e2=cv_cv(X_learnS, Y_learnS, model, split)
        print(model, "for", data_set, "- Mean Training Error, optimized min_samples_leaf, max_depth: ", errorF, c1, c2)
    else: 
        best_model, errorF, c1, c2,e1,e2=cv_cv(X_learnS, Y_learnS, model, split)
        print(model, "for", data_set, "- Mean Training Error, optimized p, n_neighbors: ", errorF, c1, c2)

    reel=best_model.predict(X_test)
    # 
    # #Save prediction file in Kaggle format
    predictions = reel
    kaggle.kaggleize(predictions, "../Predictions/"+data_set+"/test.csv")
Example #3
    def executeTrainDT(self, data, kfold, depthLst, fileTestOutputDT):
        trainX = data[0]
        trainY = data[1]
        testX = data[2]

        tree_para = {'criterion': ['gini'], 'max_depth': depthLst}
        clf = GridSearchCV(DecisionTreeClassifier(),
                           tree_para,
                           cv=kfold,
                           n_jobs=12)
        clf.fit(trainX, trainY)
        meanTestAccuracy = clf.cv_results_['mean_test_score']
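        # mean_test_score is the CV accuracy for each max_depth; 1 - accuracy below is the CV error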

        bestPara = clf.best_estimator_
        print("DT cvResult : ", bestPara.max_depth, 1.0 - meanTestAccuracy)

        kwargs = {'criterion': 'gini', 'max_depth': bestPara.max_depth}
        predY = self.trainTestWholeData(trainX, trainY, testX,
                                        DecisionTreeClassifier, kwargs)
        #print ("predY DT: ", predY)
        #output to file
        if fileTestOutputDT != "":
            kaggle.kaggleize(predY, fileTestOutputDT)

        return (min(1.0 - meanTestAccuracy), kfold, bestPara.max_depth)
Example #4
def indoor_localization_best():
    train_x, train_y, test_x = read_data_localization_indoors()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)
    print('Best model for Indoor Localization Dataset')
    parameter = 9
    print("Training with full train data with Model: KNN for n=" +
          str(parameter))
    neigh = KNeighborsRegressor(n_neighbors=parameter)
    neigh = neigh.fit(train_x, train_y)
    y_hat = neigh.predict(train_x)
    e = compute_error(y_hat, train_y)
    print("MAE train error=" + str(e))
    y_pred = neigh.predict(test_x)
    # Test output values
    predicted_y = y_pred
    # Output file location
    file_name = '../Predictions/IndoorLocalization/best.csv'
    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    error_mat[:] = []
    time_mat[:] = []
    min_e = 10000000
    min_n = 3

    print('\n\n')
Example #5
def testDataOutputFile(weights, test_x, unflatten, fileTestOutput):
    (W, b, V, c) = unflatten(weights)
    out = feedForward(W, b, V, c, test_x)
    predY = np.argmax(out, axis=1)
    #output to file
    if fileTestOutput != "":
        kaggle.kaggleize(predY, fileTestOutput)
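# kaggle.kaggleize is a helper module used throughout these examples but not shown here.
# A minimal sketch, assuming it simply writes an "Id,Prediction" CSV (some call sites
# also pass a third boolean flag whose meaning is not shown), might look like:
#
#     def kaggleize(predictions, file_name):
#         ids = range(1, len(predictions) + 1)
#         with open(file_name, 'w') as f:
#             f.write('Id,Prediction\n')
#             for i, p in zip(ids, predictions):
#                 f.write('{},{}\n'.format(i, p))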
Example #6
    def executeTrainLinearReg(self, data, kfold, alphaLst, fileTestOutputDT):
        trainX = data[0]
        trainY = data[1]
        testX = data[2]

        logReg_para = {'loss': ['hinge', 'log'], 'alpha': alphaLst}
        clf = GridSearchCV(linear_model.SGDClassifier(),
                           logReg_para,
                           cv=kfold,
                           n_jobs=12)
        clf.fit(trainX, trainY)
        meanTestAccuracy = clf.cv_results_['mean_test_score']

        bestPara = clf.best_estimator_
        print("logistic regression cvResult : ", bestPara.loss, bestPara.alpha,
              1.0 - meanTestAccuracy)

        kwargs = {'loss': 'hinge', 'alpha': bestPara.alpha}
        predY = self.trainTestWholeData(trainX, trainY, testX,
                                        linear_model.SGDClassifier, kwargs)
        #print ("predY DT: ", predY)
        #output to file
        if fileTestOutputDT != "":
            kaggle.kaggleize(predY, fileTestOutputDT + 'hinge')

        kwargs = {'loss': 'log', 'alpha': bestPara.alpha}
        predY = self.trainTestWholeData(trainX, trainY, testX,
                                        linear_model.SGDClassifier, kwargs)
        #print ("predY DT: ", predY)
        #output to file
        if fileTestOutputDT != "":
            kaggle.kaggleize(predY, fileTestOutputDT + 'log')

        return (min(1.0 - meanTestAccuracy), kfold, bestPara.alpha)
Example #7
def distance_effect(train_x, train_y, test_x):
    regr = KNeighborsRegressor(n_neighbors=3, p=1)
    # Fit Model
    regr.fit(train_x, train_y)

    # Make Prediction
    test_y = regr.predict(test_x)

    # Create test output values
    predicted_y = test_y * -1

    # Output file location
    file_name = '../Predictions/KNN_Manhattan.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    regr = KNeighborsRegressor(n_neighbors=3, metric='chebyshev')
    # Fit Model
    regr.fit(train_x, train_y)

    # Make Prediction
    test_y = regr.predict(test_x)

    # Create test output values
    predicted_y = test_y * -1

    # Output file location
    file_name = '../Predictions/KNN_chebyshev.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
Example #8
def power_plant_best():
    train_x, train_y, test_x = read_data_power_plant()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)
    print('Best model for Power Output Dataset')
    parameter = 13
    print("Training with full train data: Model=Decision Tree for depth=" +
          str(parameter))
    clf = tree.DecisionTreeRegressor(criterion='mse', max_depth=parameter)
    clf = clf.fit(train_x, train_y)
    y_hat = clf.predict(train_x)
    e = compute_error(y_hat, train_y)
    print("MAE train error=" + str(e))
    y_pred = clf.predict(test_x)
    # Test output values
    predicted_y = y_pred
    # Output file location
    file_name = '../Predictions/PowerOutput/best.csv'
    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    error_mat[:] = []
    time_mat[:] = []
    min_e = 10000000
    depth_for_min_e = 3

    print('\n\n')
Example #9
def trainSVMExtra(fileTestOutput, resultFile):
    '''
    For extra credit 1:
    try different kernels (and other hyperparameters) for the SVC model and
    select an effective kernel via grid search over several fold counts.
    '''

    train_x, train_y, test_x = read_tumor_data()
    print('Train=', train_x.shape, type(train_x))
    print('Test=', test_x.shape)

    #[1, 0.01, 0.001, 0.0001]
    fd = open(resultFile, 'a')

    kfoldLst = range(3, 12)
    biggestAccuracy = -2**32

    for kfold in kfoldLst:
        parameters = {
            'kernel': ('linear', 'rbf', 'sigmoid', 'poly'),
            'C': np.linspace(1, 5, 5),
            'gamma': [0.001, 0.1, 20],
            'degree': np.linspace(1, 5, 5)
        }
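        # note: 'degree' only affects the 'poly' kernel; the other kernels ignore it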

        clf = GridSearchCV(SVC(), parameters, cv=kfold,
                           n_jobs=10)  #scoring= "neg_mean_squared_error" )
        clf.fit(train_x, train_y)
        meanTestError = clf.cv_results_['mean_test_score']
        bestPara = clf.best_estimator_

        if clf.best_score_ > biggestAccuracy:
            biggestAccuracy = clf.best_score_
            paramtersBest = [
                bestPara.C, bestPara.gamma, bestPara.degree, bestPara.kernel,
                kfold, clf.best_score_
            ]

        #print ("trainKernelRidgeExtra Result : ", bestPara.C, bestPara.gamma, bestPara.degree,  bestPara.kernel, clf.best_score_, meanTestError,)
        writeToFile(fd, [
            bestPara.C, bestPara.gamma, bestPara.degree, bestPara.kernel,
            kfold, clf.best_score_
        ] + list([meanTestError]))
        # kwargs = {'n_neighbors': bestPara.n_neighbors}

        clf = SVC(C=bestPara.C,
                  gamma=bestPara.gamma,
                  degree=bestPara.degree,
                  kernel=bestPara.kernel)
        clf.fit(train_x, train_y)
        predY = clf.predict(test_x)

        #print ("predY DT: ", predY)
        #output to file
        if fileTestOutput != "":
            kaggle.kaggleize(predY, fileTestOutput + str(kfold), False)

    print("best final trainSVMExtra Result: ", paramtersBest)
Example #10
def stratifyDataTrainTest3layerNN():
    data = read_image_data()
    train_x = data[0]
    train_y_integers = data[1]
    test_x = data[2]

    #normalize
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)  # reuse the scaler fitted on train_x rather than refitting on test_x

    print("train_x. shape:", train_x.shape)

    #split
    xsplitTrain, xsplitTest, ysplitTrain_integer, ysplitTest_integer = train_test_split(
        train_x,
        train_y_integers,
        test_size=0.2,
        random_state=0,
        stratify=train_y_integers)
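    # stratify=train_y_integers keeps the class proportions the same in the 80/20 split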

    hidden_layer_sizes_lst = [(5, 5, 5), (10, 10, 10), (40, 40, 40),
                              (70, 70, 70), (100, 100, 100)]

    largestAccuracy = -2**32
    best_hidden_layer_size = None
    for hidden_layer_sizes in hidden_layer_sizes_lst:
        beginTime = time.time()
        mlp = MLPClassifier(hidden_layer_sizes,
                            activation='tanh',
                            max_iter=1000,
                            momentum=0.9,
                            epsilon=1e-8)
        mlp.fit(xsplitTrain, ysplitTrain_integer)
        #pred = mlp.predict(xsplitTest)              #predict validation set
        meanAccuracy = mlp.score(xsplitTest, ysplitTest_integer)
        if meanAccuracy > largestAccuracy:
            largestAccuracy = meanAccuracy
            best_hidden_layer_size = hidden_layer_sizes

        print("stratifyDataTrainTest3layerNN. elapsed time, validation accuracy:",
              time.time() - beginTime, meanAccuracy)

    print("stratifyDataTrainTest3layerNN. best validation accuracy, hidden layer sizes:",
          largestAccuracy, best_hidden_layer_size)
    #train and test the whole data
    mlp = MLPClassifier(best_hidden_layer_size,
                        activation='tanh',
                        max_iter=1000,
                        momentum=0.9,
                        epsilon=1e-8)
    mlp.fit(train_x, train_y_integers)
    predyTest = mlp.predict(test_x)

    #output to file
    fileTestOutput3LayerNN = "../Predictions/best_3HiddenNN.csv"
    if fileTestOutput3LayerNN != "":
        kaggle.kaggleize(predyTest, fileTestOutput3LayerNN)
Example #11
def kFold(features_train, labels_train, features_test, labels_test, path,
          DataSet):
    n = features_train.shape
    k = 10  # make it 10 fold test
    size = n[0] // k  # size of each fold.
    errors1 = {}  # to save CrossValidation Errors
    errors2 = {}  # to save Training Errors
    for p in range(1, 20):  # neighbour counts 1 through 19
        print("Considering Neighbor=" + str(p))
        e1 = []  # mean squared errors on the cross-validation folds for this neighbour count
        e2 = []  # mean squared errors on the training folds for this neighbour count
        for i in range(1, k):
            # Select the cross validation set; it changes with i
            # (features_train[i*size:][:size] takes the i-th block of `size` consecutive rows)
            Feature_CrossVal = features_train[i * size:][:size]
            Label_CrossVal = labels_train[i * size:][:size]

            # The rest of the data forms the training set
            # (np.append returns a new array, so the result must be assigned; concatenate is used here)
            Feature_Train = np.concatenate(
                (features_train[:i * size], features_train[(i + 1) * size:]), axis=0)
            Label_Train = np.concatenate(
                (labels_train[:i * size], labels_train[(i + 1) * size:]), axis=0)

            # K Nearest Neighbor training with testset and testing with Cross Validation set
            neigh = KNeighborsClassifier(n_neighbors=p)
            neigh.fit(Feature_Train, Label_Train)
            predict = np.zeros(Label_CrossVal.shape)
            predict = neigh.predict(Feature_CrossVal)

            # K Nearest Neighbor training and testing on same set
            ne = KNeighborsClassifier(n_neighbors=p)
            ne.fit(Feature_Train, Label_Train)
            pre = np.zeros(Label_Train.shape)
            pre = ne.predict(Feature_Train)

            # Find the mean square error between the found predictions to the cross validation output set add error in a list
            e1.append(mean_squared_error(Label_CrossVal, predict))
            e2.append(mean_squared_error(Label_Train, pre))
            print(e1, e2)  # print progress so it doesn't look like the program hung :P

        # take mean of the K fold errors for particular neighbor and append it
        errors1[p] = np.mean(e1)
        errors2[p] = np.mean(e2)

    #Find predictions with Min Error Neighbor
    neigh = KNeighborsClassifier(n_neighbors=min(errors1, key=errors1.get))
    #fitting the training data
    neigh.fit(features_train, labels_train)
    predict = np.zeros(labels_test.shape)
    #testing with test features
    predict = neigh.predict(features_test)
    kaggle.kaggleize(
        predict,
        str(path) + "/Submission/Predictions/" + str(DataSet) +
        "/CrossValidation_KNN.csv")
    #returning errors to run_me.py file for plotting CrossValidation and Training Error

    return errors1, errors2
Example #12
def kernelRidgeSkLearnCV(kfold=8, fileTestOutput="best_cv"):
    '''
    call kernel ridge regression with different parameters on the credit card activity data;
    use cross-validation to estimate the out-of-sample mean squared error
    '''

    train_x, train_y, test_x = read_creditcard_data()
    print('Train=', train_x.shape, type(train_x))
    print('Test=', test_x.shape)

    #train_x = normalize(train_x, axis=0)
    #test_x = normalize(test_x, axis=0)

    #train_x =  StandardScaler().fit_transform(train_x)
    #test_x =  StandardScaler().fit_transform(test_x)

    alphaParaLst = [1, 0.0001]

    gammaParaLst = [None, 1, 0.001]

    kernelParaLst = ["rbf", "polynomial", "linear"]

    mseErrorSmallest = 2**32
    #mseErrorLst = []
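    # cross_val_score with scoring="neg_mean_squared_error" returns negated MSE,
    # so the mean is multiplied by -1 below to recover the MSE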
    for alpha in alphaParaLst:
        for gamma in gammaParaLst:
            for kernel in kernelParaLst:
                clf = KernelRidge(alpha=alpha,
                                  gamma=gamma,
                                  degree=3,
                                  kernel=kernel)
                mseError = -1 * np.mean(
                    cross_val_score(clf,
                                    train_x,
                                    train_y,
                                    cv=kfold,
                                    scoring="neg_mean_squared_error"))
                print("mseError: ", alpha, gamma, kernel, mseError)
                #mseErrorLst.append(mseError)
                if mseError < mseErrorSmallest:
                    mseErrorSmallest = mseError
                    paramtersBest = [alpha, gamma, kernel]

    print("best mseError: ", kfold, paramtersBest, mseErrorSmallest)

    #train whole data
    clf = KernelRidge(alpha=paramtersBest[0],
                      gamma=paramtersBest[1],
                      degree=3,
                      kernel=paramtersBest[2])
    clf.fit(train_x, train_y)
    yPred = clf.predict(test_x)

    #output file
    if fileTestOutput != "":
        kaggle.kaggleize(yPred, fileTestOutput, True)
Example #13
def compute_KNN(train_x, train_y, test_x):
    # Different number of neighbors to run
    neighbor = [3, 5, 10, 20, 25]
    # Initialize Variables
    train_score = [0] * len(neighbor)
    test_score = [0] * len(neighbor)
    fit_time = [0] * len(neighbor)
    score_time = [0] * len(neighbor)

    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    # indexing
    index = 0
    for n in neighbor:
        # Create the model
        regr = KNeighborsRegressor(n_neighbors=n)
        # Cross Validation
        cv_score = cross_validate(regr,
                                  train_x,
                                  train_y,
                                  return_train_score=True,
                                  scoring=MAE,
                                  cv=2)
        # Extract Statistics, scorer negates compute_error output
        train_score[index] = -cv_score['train_score'].mean()
        test_score[index] = -cv_score['test_score'].mean()

        # Print Statistics
        print('Number of Neighbors:', n)
        print('train score', train_score[index])
        print('test score', test_score[index])
        print('===============================')

        # Fit Model
        regr.fit(train_x, train_y)

        # Make Prediction
        test_y = regr.predict(test_x)

        # Create test output values
        predicted_y = test_y * -1

        # Output file location
        file_name = '../Predictions/KNearestN_Neighbors_%d.csv' % n

        # Writing output in Kaggle format
        print('Writing output to ', file_name)
        kaggle.kaggleize(predicted_y, file_name)

        # Increase indexing
        index = index + 1
Example #14
def predictNextBallBayesInference(file_name):
    iterations = np.arange(10000, 1000000, 10000)  # alternatively a single value, e.g. [100000]
    for iters in iterations:
        prediction_prob = list()
        lengths = [10, 15, 20, 25]
        for l in lengths:
            BArray = np.loadtxt('../../Data/B_sequences_%s.txt' % (l), delimiter=',', dtype=float)
            for b in np.arange(BArray.shape[0]):
                prob = getNextBallMCMCQuestionl(BArray[b, :], iters)
                prediction_prob.append(prob)
                #print('Prob of next entry in ', BArray[b, :], 'is black is', prediction_prob[-1])
                #print('Prob of next entry in is black is', prediction_prob[-1])
        print('Writing output to ', file_name + "_iteration_" + str(iters))
        kaggle.kaggleize(np.array(prediction_prob), file_name + "_iteration_" + str(iters))
Example #15
def svmSklearnCV(kfold=7, fileTestOutput="best_cv"):
    '''
    call svm train and predict with different parameters on the tumor data;
    use cross-validation to estimate the out-of-sample accuracy
    '''

    train_x, train_y, test_x = read_tumor_data()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    cLst = [1, 0.01, 0.0001]
    gammaLst = [1, 0.01, 0.001]
    kernelParaLst = ['rbf', 'poly=3', 'poly=5', 'linear']
    degree = 3
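    # 'poly=3' / 'poly=5' encode kernel and degree in one string; they are split apart in the loop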

    accuracyLargest = -2**32
    for c in cLst:
        for gamma in gammaLst:
            for kernel in kernelParaLst:
                if kernel.split("=")[0] == "poly":
                    degree = int(kernel.split("=")[1])
                    kernel = kernel.split("=")[0]
                clf = SVC(C=c, kernel=kernel, degree=degree, gamma=gamma)

                accuracy = np.mean(
                    cross_val_score(clf,
                                    train_x,
                                    train_y,
                                    cv=kfold,
                                    scoring="accuracy"))
                print("accuracy: ", c, gamma, kernel, degree, accuracy)
                if accuracy > accuracyLargest:
                    accuracyLargest = accuracy
                    paramtersBest = [c, gamma, degree, kernel]

    print("best accuracy parameters: ", kfold, paramtersBest, accuracyLargest)

    #train whole data
    clf = SVC(C=paramtersBest[0],
              gamma=paramtersBest[1],
              degree=paramtersBest[2],
              kernel=paramtersBest[3])
    clf.fit(train_x, train_y)
    yPred = clf.predict(test_x)

    #output file
    if fileTestOutput != "":
        kaggle.kaggleize(yPred, fileTestOutput, False)
Example #16
def power_plant_LinearModel(k_fold):
    train_x, train_y, test_x = read_data_power_plant()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)
    alphas = [pow(10, -6), pow(10, -4), pow(10, -2), 1, 10]
    y_pred = choose_best_LinearModel(train_x, train_y, test_x, alphas, k_fold)

    #####plot
    avg_time = []
    log_alphas = []
    for i in time_mat:
        avg_time.append(i[k_fold])
    for i in alphas:
        log_alphas.append(math.log10(i))
    plt.plot(log_alphas, avg_time[0:len(alphas)], 'ro', label='Lasso')
    plt.plot(log_alphas,
             avg_time[len(alphas):2 * len(alphas)],
             'bo',
             label='Ridge')
    plt.legend(loc='upper center')
    plt.ylabel('Avg Time for validation')
    plt.xlabel('Alpha for Linear Model (Log)')
    plt.title('Power Plant dataset (Model=Linear Model)')
    plt.show()

    predicted_y = y_pred
    file_name = '../Predictions/PowerOutput/power_plant_LM.csv'
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    file2 = open('PPlogfile_LM.txt', 'w')
    file2.write(str(error_mat) + '\n\n\n')
    file2.write(str(time_mat))
    file2.close()

    #clear global variables
    error_mat[:] = []
    time_mat[:] = []
    alpha_for_min_e = 3
    flag = True
    min_e = 100000000
    avg_time[:] = []
    log_alphas[:] = []

    print('\n\n')
Example #17
def compute_NN(train_x, train_y, test_x):

    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    ######### Neural Network #########
    # make pipeline
    std_NN = make_pipeline(StandardScaler(), MLPRegressor())
    params = {
        'mlpregressor__hidden_layer_sizes': [(10, ), (20, ), (30, ), (40, )],
        'mlpregressor__max_iter': [1000]
    }
    gs = GridSearchCV(estimator=std_NN,
                      param_grid=params,
                      scoring=MAE,
                      n_jobs=-1,
                      cv=5,
                      return_train_score=True)

    # fit grid search
    gs.fit(train_x, train_y)

    print('NN train score', -gs.cv_results_['mean_train_score'])
    print('NN test score', -gs.cv_results_['mean_test_score'])
    print('Best Parameter', gs.best_params_)
    print('Best score', -gs.best_score_)
    print('Parameters', gs.cv_results_['params'])

    # Train the best Model
    best_NN = make_pipeline(StandardScaler(),
                            MLPRegressor(hidden_layer_sizes=(20, )))
    best_NN.fit(train_x, train_y)

    # Make Prediction
    test_y = best_NN.predict(test_x)
    # Create test output values
    predicted_y = test_y * -1
    # Output file location
    file_name = '../Predictions/NN_best.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
Example #18
def classifierKnn(train1, label1, train2, label2):
    metric = 'euclidean'
    k = 2

    clf = neighbors.KNeighborsClassifier(n_neighbors=k,
                                         weights='distance',
                                         metric=metric)
    clf.fit(train1, label1)

    # Compute a prediction for every point in the test set
    predictions = np.zeros(label2.shape)
    for i in range(len(train2)):
        x = train2[i, :].reshape(1, -1)
        predictions[i] = clf.predict(x)[0]

    for i in range(len(predictions)):
        print(train2[i, :], predictions[i])

    # Save prediction file in Kaggle format
    kaggle.kaggleize(predictions, "../Predictions/Digits/test.csv")
Example #19
def pipe_line_final(path, param_grid, nfs, data_set):
    if data_set=='Blog':
        train_m = np.load(path + 'train.npy')
        X_all=train_m[:,0:train_m.shape[1]-1]
        Y_all=train_m[:,-1]
        f=SelectKBest(f_regression)
        X_new=f.fit_transform(X_all, Y_all)
        train_x, train_y=shuffle(X_new, Y_all)
        selected_params=f.get_support(indices=True)

        reg=GridSearchCV(GradientBoostingRegressor(), param_grid, scoring='neg_mean_squared_error')
        reg.fit(train_x,train_y)

        print("For ", data_set, "- best params: ", reg.best_params_, "Feature_subset: ", f.get_support(indices=True))
        
        test=np.load(path + 'test_distribute.npy')
        test_x=test[:, 0:test.shape[1]-1]
        # boolean mask over the feature columns: True for the ones SelectKBest kept
        tr = f.get_support()
        X_masked = test_x[:, tr]
        predictions=reg.predict(X_masked)
        kaggle.kaggleize(predictions, "../Predictions/BlogFeedback/test.csv")
    
    else:
        train = np.load(path + 'train.npy')
        test = np.load(path + 'test_private.npy')
        train_x, train_y=shuffle(train[:, 0:train.shape[1]-1], train[:, -1])
        test_x = test[:, 0:test.shape[1]-1]
        test_y = test[:, -1]
        rfe=RFE(DecisionTreeRegressor(), n_features_to_select=nfs, step=2)
        reg=GridSearchCV(rfe, param_grid)
        reg.fit(train_x,train_y)
        print("For ", data_set, "- best params: ", reg.best_params_, "Feature_subset: ", reg.best_estimator_.ranking_, "RMSE: ", np.sqrt(mean_squared_error(test_y, reg.predict(test_x))))
Example #20
def Experiment_DataSet(features_train, labels_train, features_test,
                       labels_test, path, i):

    #Support Vector Regression
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    y_rbf = svr_rbf.fit(features_train, labels_train).predict(features_test)
    kaggle.kaggleize(
        y_rbf, path + "Submission/Predictions/" + str(i) + "/RFE_SVR.csv")
    print("Support Vector Regression")

    #K Nearest Neighbor
    neigh = KNeighborsRegressor(n_neighbors=6)
    neigh.fit(features_train, labels_train)
    predict = neigh.predict(features_test)
    kaggle.kaggleize(
        predict, path + "Submission/Predictions/" + str(i) + "/KNN_REG.csv")
    print("K Nearest Neighbor")

    #LassoCV with RFE and pipeline merge of both
    alpha = np.arange(0.1, 2, 0.1)  #defining range of alphas
    lasso = linear_model.LassoCV(alphas=alpha)
    rfe = RFE(estimator=lasso, step=1)
    Lasso_Pipeline = make_pipeline(rfe, svr_rbf)
    Lasso_Pipeline.fit(features_train, labels_train)
    predict = Lasso_Pipeline.predict(features_test)
    kaggle.kaggleize(
        predict, path + "Submission/Predictions/" + str(i) + "/LassoCV.csv")
    print("LassoCV with RFE and pipeline")
Example #21
def final_model(train_x, train_y, test_x):
    np.random.seed(2018)
    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    scaler = StandardScaler()
    scaler.fit(train_x)
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    # create model
    model = Sequential()
    model.add(Dense(20, input_shape=(52, ), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='relu'))
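    # the final ReLU output unit constrains predictions to be non-negative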

    opti_adam = optimizers.Adam(lr=0.1, beta_1=0.9)
    # Compile model
    model.compile(loss='MAE', optimizer=opti_adam, metrics=['accuracy'])
    # Fit the model
    model.fit(train_x, train_y, epochs=150, batch_size=200)
    # evaluate the model
    scores = model.evaluate(train_x, train_y)
    print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

    test_y = model.predict(test_x)

    predicted_y = test_y * -1
    # Output file location
    file_name = '../Predictions/NN_best_competition.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
Example #22
def pipe_line(path, test_y_p, param_grid, nfs, data_set):
    if test_y_p:
        train = np.load(path + 'train.npy')
        test = np.load(path + 'test_private.npy')
    else:
        train_m = np.load(path + 'train.npy')
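        # no private test labels in this branch, so hold out the second half of the training file as the test split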
        train=train_m[:(len(train_m)//2)]
        test = train_m[(len(train_m)//2):]

    train_x, train_y=shuffle(train[:, 0:train.shape[1]-1], train[:, -1])
    test_x = test[:, 0:test.shape[1]-1]
    test_y = test[:, -1]
    rfe=RFE(DecisionTreeRegressor(), n_features_to_select=nfs, step=2)
    reg=GridSearchCV(rfe, param_grid)
    reg.fit(train_x,train_y)
    print("For ", data_set, "- best params: ", reg.best_params_, "Feature_subset: ", reg.best_estimator_.ranking_, "RMSE: ", np.sqrt(mean_squared_error(test_y, reg.predict(test_x))))
    if not test_y_p:
        test_f=np.load(path + 'test_distribute.npy')
        predictions=reg.predict(test_f[:, 0:test.shape[1]-1])
        kaggle.kaggleize(predictions, "../Predictions/BlogFeedback/test.csv")
Example #23
def indoor_localization_KNN():
    train_x, train_y, test_x = read_data_localization_indoors()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)
    k_fold = 5
    neighbour_list = [3, 5, 10, 20, 25]
    y_pred = choose_best_KNN(train_x, train_y, test_x, k_fold, neighbour_list)
    # Create dummy test output values
    predicted_y = y_pred
    # Output file location
    file_name = '../Predictions/IndoorLocalization/indoor_localization_KNN.csv'
    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    file2 = open('ILlogfile_KNN.txt', 'w')
    file2.write(str(error_mat) + '\n')
    file2.write(str(time_mat))
    file2.close()

    #######plot
    avg_time = []
    for i in time_mat:
        avg_time.append(i[k_fold])
    graph = plt.plot(neighbour_list, avg_time, 'rs')
    plt.ylabel('Avg Time for Validation(ms)')
    plt.xlabel('Number of neighbours')
    plt.title('Indoor Localization dataset (Model=KNN)')
    plt.show()

    error_mat[:] = []
    time_mat[:] = []
    min_e = 10000000
    min_n = 3
    avg_time[:] = []

    print('\n\n')
Example #24
def power_plant_DT(k_fold):
    train_x, train_y, test_x = read_data_power_plant()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)
    depths = [3, 6, 9, 12, 15]
    y_pred = choose_best_DT(train_x, train_y, test_x, depths, k_fold)

    #######plot
    avg_time = []
    for i in time_mat:
        avg_time.append(i[k_fold])
    graph = plt.plot(depths, avg_time, 'rs')
    plt.ylabel('Avg Time for Validation(ms)')
    plt.xlabel('Depth of Tree')
    plt.title('Power plant dataset (Model=Decision Tree)')
    plt.show()

    ######writing predictions to CSV file
    predicted_y = y_pred
    file_name = '../Predictions/PowerOutput/power_plant_DT.csv'
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    #######logs
    file2 = open('PPlogfile_DT.txt', 'w')
    file2.write(str(error_mat) + '\n\n')
    file2.write(str(time_mat) + '\n\n')
    file2.write(str(avg_time) + '\n')
    file2.close()

    error_mat[:] = []
    time_mat[:] = []
    avg_time[:] = []
    min_e = 10000000
    depth_for_min_e = 3

    print('\n\n')
Example #25
def indoor_localization_DT(k_fold):
    train_x, train_y, test_x = read_data_localization_indoors()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)
    depths = [3, 6, 9, 12, 15]
    y_pred = choose_best_DT(train_x, train_y, test_x, depths, k_fold)

    #####plot avg time
    avg_time = []
    for i in time_mat:
        avg_time.append(i[k_fold])
    plt.plot(depths, avg_time, 'bo')
    plt.ylabel('Avg Time for Validation')
    plt.xlabel('Depth of Tree')
    plt.title('Indoor Localisation Dataset')
    plt.show()

    #####write predictions in CSV file
    predicted_y = y_pred
    file_name = '../Predictions/IndoorLocalization/indoor_localization_DT.csv'
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    #####log file
    file2 = open('ILlogfile_DT.txt', 'w')
    file2.write(str(error_mat) + '\n\n')
    file2.write(str(time_mat) + '\n\n')
    file2.write(str(avg_time) + '\n')
    file2.close()

    error_mat[:] = []
    time_mat[:] = []
    avg_time[:] = []
    min_e = 10000000
    depth_for_min_e = 3

    print('\n\n')
Example #26
    def executeTrainKNN(self, data, kfold, knnLst, fileTestOutputDT):
        trainX = data[0]  # [0:1000, :]  use a smaller slice first for debugging
        trainY = data[1]  # [0:1000]
        testX = data[2]

        knn_para = {'n_neighbors': knnLst}
        clf = GridSearchCV(KNeighborsClassifier(),
                           knn_para,
                           cv=kfold,
                           n_jobs=12)
        clf.fit(trainX, trainY)
        meanTestAccuracy = clf.cv_results_['mean_test_score']

        bestPara = clf.best_estimator_
        print("KNN cvResult : ", bestPara.n_neighbors, 1.0 - meanTestAccuracy)

        kwargs = {'n_neighbors': bestPara.n_neighbors}
        predY = self.trainTestWholeData(trainX, trainY, testX,
                                        KNeighborsClassifier, kwargs)
        #print ("predY DT: ", predY)
        #output to file
        if fileTestOutputDT != "":
            kaggle.kaggleize(predY, fileTestOutputDT)
Example #27
def CrossValidation_Robot(features_train,labels_train,features_test,labels_test,path):
    n=features_train.shape
    k=2 # make it 2 fold 
    size = n[0]//k # size of each fold.
    errors1={}
    e1=[]
    errors2={}
    Cvalue=np.arange(100,500,100)
    degree=np.arange(1,n[1],2)
    for d in degree:
        for p in Cvalue:          # loop over candidate regularization constants C
            print("Considering C="+str(p))
            for i in range(1,k): 
                # Select the cross Validation set, it will change with changing values of i       
                Feature_CrossVal = features_train[i*size:][:size]  
                Label_CrossVal = labels_train[i*size:][:size]   
                
                # The rest of the data forms the training set
                # (np.append returns a new array, so the result must be assigned; concatenate is used here)
                Feature_Train = np.concatenate((features_train[:i*size], features_train[(i+1)*size:]), axis=0)
                Label_Train = np.concatenate((labels_train[:i*size], labels_train[(i+1)*size:]), axis=0)
                #define Support Vector Regression with Degree and Regression Coefficient as Hyperparameter
                svr_rbf = SVR(kernel='rbf', C=p, degree=d)
                y_rbf = svr_rbf.fit(Feature_Train, Label_Train).predict(Feature_CrossVal)
                # Find the mean square error between the found predictions to the cross validation output set add error in a list 
                e1.append(np.sqrt(mean_squared_error(Label_CrossVal,y_rbf)))      
                print(e1)
            # take the mean of the fold errors for this setting
            errors1[p]=np.mean(e1) #mean RMSE keyed by the regularization constant C
        errors2[d]=np.mean(e1)     #mean RMSE keyed by the polynomial degree
        
    #Find predictions with Min Error degree and Regression Coefficient
    print(min(errors1, key=errors1.get))
    #define the Support Vector Regression  with optimum Regression Coefficient and Degree
    svr_rbf = SVR(kernel='rbf', C=min(errors1, key=errors1.get), degree=min(errors2, key=errors2.get))
    predict=np.zeros(labels_test.shape)
    #fit the data and predict the values
    predict = svr_rbf.fit(features_train, labels_train).predict(features_test)
    #save the output in CSV format
    kaggle.kaggleize(predict, path+"Submission/Predictions/RobotArm/SupportVectorRegression_CrossValidated.csv")
    kaggle.kaggleize(predict, path+"Submission/Predictions/RobotArm/best.csv")

    #Plot graph representing Regression coefficient vs Error
    plt.figure(1, figsize=(6,4))
    plt.plot(list(errors1.keys()),list(errors1.values()),'sb-', linewidth=3) #Plot the first series in blue with square marker
    plt.ylabel("Error") #Y-axis label
    plt.xlabel("Regression Coefficient") #X-axis label
    plt.title("Error vs Regression Coefficient for Robot Dataset") #Plot title
    #Save the chart
    plt.savefig(path+"/Submission/Figures/ErrorVsRegressionCoefficient_Robot.pdf")
    plt.show()
    
    #Plot graph representing Degree vs Error
    plt.figure(2, figsize=(6,4))
    plt.plot(list(errors2.keys()),list(errors2.values()),'or-', linewidth=3) #Plot the second series in red with circle marker
    plt.ylabel("Error") #Y-axis label
    plt.xlabel("Degree") #X-axis label
    plt.title("Degree vs Mean Square Error for Robot Dataset") #Plot title
    #Save the chart
    plt.savefig(path+"/Submission/Figures/ErrorVsDegree_Robot.pdf")
    plt.show()
Example #28
#model_selection.credit_card(train_x, train_y)
result_cc = model_selection.credit_card(train_x, train_y)
print(result_cc, "\n")
print("Best parameter is: ", min(result_cc, key = result_cc.get) )


clf = KernelRidge(alpha=0.0001, kernel='rbf', gamma=None)
clf.fit(train_x, train_y)
predicted_y = clf.predict(test_x)

# Output file location
file_name = '../Predictions/CreditCard/best.csv'
# Writing output in Kaggle format
print('Writing output to ', file_name)
kaggle.kaggleize(predicted_y, file_name, True)


######################### 2.a

train_x, train_y, test_x  = read_tumor_data()
print('Train=', train_x.shape)
print('Test=', test_x.shape)

result_t = model_selection.tumor(train_x, train_y)
print(result_t, "\n")
print("Best parameter is: ", max(result_t, key = result_t.get))

clf = SVC(C=1.0, kernel='rbf', gamma=0.001)
clf.fit(train_x, train_y)
predicted_y = clf.predict(test_x)
Example #29
def kagglizing(predicted_y, best):
    file_name = '../Predictions/' + best + '.csv'
    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
Example #30
def compute_DT(train_x, train_y, test_x):
    # Different values of max_depth to run
    depth = [3, 6, 9, 12, 15]
    # Initialize Variables
    train_score = [0] * len(depth)
    test_score = [0] * len(depth)
    fit_time = [0] * len(depth)
    score_time = [0] * len(depth)

    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    # indexing
    index = 0
    for d in depth:
        # Create the model
        regr = DecisionTreeRegressor(criterion="mae", max_depth=d)
        # Cross Validation
        cv_score = cross_validate(regr,
                                  train_x,
                                  train_y,
                                  return_train_score=True,
                                  scoring=MAE,
                                  cv=5)
        # Extract Statistics, scorer negates compute_error output
        train_score[index] = -cv_score['train_score'].mean()
        test_score[index] = -cv_score['test_score'].mean()
        fit_time[index] = 1000 * cv_score['fit_time'].sum()
        score_time[index] = 1000 * cv_score['score_time'].sum()

        # Print Statistics
        print('Depth of Decision Tree:', d)
        print('train score', train_score[index])
        print('test score', test_score[index])
        print('fit time', fit_time[index])
        print('score time', score_time[index])
        print('===============================')

        # Fit Model
        regr.fit(train_x, train_y)

        # Make Prediction
        test_y = regr.predict(test_x)

        # Create test output values
        predicted_y = test_y * -1
        # Output file location
        file_name = '../Predictions/Decision_Tree_depth_%d.csv' % d
        # Writing output in Kaggle format
        print('Writing output to ', file_name)
        kaggle.kaggleize(predicted_y, file_name)

        # Increase indexing
        index = index + 1

    # Plot CV Time
    plt.figure(num=None, figsize=(16, 8), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(depth, fit_time, '.')
    plt.xlabel('Depth of Decision Tree')
    plt.ylabel('Cross Validation Time [msec]')
    plt.savefig('../Figures/DT_cv_time.png')
Example #31
def choose_regression_model(problem_instance):
    if problem_instance == 1:
        #Load the Computer Activity Data
        path = '../../Data/ComputerActivity/'
    elif problem_instance == 2:
        #Load the Housing Data
        path = '../../Data/Housing/'

    data = np.load(path + 'Data.npz')
    features_train = data['X_train']
    labels_train = data['y_train']
    features_test = data['X_test']
    labels_test = data['y_test']
    n_estimator = []
    print("Data shapes:", features_train.shape, labels_train.shape, features_test.shape, labels_test.shape)

    #Regression Method
    if problem_instance == 1:
        print("Executing Computer Activity problem")
        #transform = feature_selection.SelectKBest(feature_selection.f_regression)
        transform = feature_selection.RFECV(estimator = RidgeCV())
        pipeline = Pipeline([('anova', transform), ('adr', ensemble.GradientBoostingRegressor(random_state=404))])
        n_estimator = np.arange(75, 86, 1)
        depth = range(6, 8)
        #n_estimator = np.arange(10, 100, 10)
        parameters = {'anova__cv': [5,10],
                      #'anova__k': np.arange(15, 22, 1),
                      'adr__n_estimators': n_estimator,
                      'adr__max_depth': depth
                      }
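        # the 'anova__' / 'adr__' prefixes route each setting to the matching named Pipeline step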
    elif problem_instance == 2:
        print("Executing Housing problem")
        transform = feature_selection.SelectKBest(feature_selection.f_regression)
        #transform = feature_selection.RFECV(estimator = RidgeCV())
        n_estimator = np.arange(130, 160, 10)
        #n_estimator = np.arange(10, 100, 1)
        depth = range(6,8)
        pipeline = Pipeline([('anova', transform), ('adr', ensemble.GradientBoostingRegressor(random_state=404))])
        parameters = {'anova__k': np.arange(5, 9, 1),
                      #'anova__cv': [5,10],
                      'adr__n_estimators': n_estimator, #50
                      'adr__max_depth': depth
                      }
    # sklearn.grid_search and grid_scores_ were removed; use model_selection.GridSearchCV and cv_results_
    grid = model_selection.GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    grid.fit(features_train, labels_train)
    predictions = grid.predict(features_test)
    print(grid.best_params_, grid.best_score_, grid.best_estimator_, grid.cv_results_['mean_test_score'])
    scores = grid.cv_results_
    #print(type(scores), len(scores))
    mean_score_list = list(scores['mean_test_score'])
    parameters_list = list(scores['params'])
    print(mean_score_list)
    #print(parameters_list)
    scores_list = np.array(mean_score_list)
    plot_score_linechart(scores_list, problem_instance)
    if problem_instance ==1:
        kaggle.kaggleize(predictions, "../Predictions/ComputerActivity/test.csv")
    else:
        kaggle.kaggleize(predictions, "../Predictions/Housing/test.csv")