from operator import itemgetter

import pylab
from sklearn.ensemble import RandomForestClassifier as rfc

# ANSI escape codes used to rewrite the progress line in place (assumed values)
CURSOR_UP_ONE = '\x1b[1A'
ERASE_LINE = '\x1b[2K'


def run_random_forest(training_data, training_labels, validation_data, validation_labels,
                      best_max_depth=None, best_min_samples_leaf=None):
    n_estimators_list = range(1, 51)
    training_accuracy_list = []
    validation_accuracy_list = []
    for this_n_estimator in n_estimators_list:
        print('Processing n estimator: ' + str(this_n_estimator) + '/' + str(len(n_estimators_list)))
        if best_max_depth is None:
            clf = rfc(n_estimators=this_n_estimator)
        else:
            clf = rfc(n_estimators=this_n_estimator, max_depth=best_max_depth,
                      min_samples_leaf=best_min_samples_leaf)
        (training_accuracy, validation_accuracy) = get_training_accuracy.run(clf, training_data,
                                                                             training_labels,
                                                                             validation_data,
                                                                             validation_labels)
        training_accuracy_list.append(training_accuracy)
        validation_accuracy_list.append(validation_accuracy)
        print(CURSOR_UP_ONE + ERASE_LINE + CURSOR_UP_ONE)
    
    # Plot data ------------------------------------------------------------------------------------
    training_accuracy_list = [training_accuracy*100 for training_accuracy
                              in training_accuracy_list]
    validation_accuracy_list = [validation_accuracy*100 for validation_accuracy 
                                in validation_accuracy_list]

    pylab.plot(n_estimators_list, training_accuracy_list)
    pylab.plot(n_estimators_list, validation_accuracy_list)
    
    pylab.xlabel('N Estimators')
    pylab.ylabel('Accuracy (% out of 100)')
    pylab.legend(['Training Accuracy', 'Validation Accuracy'], loc=2)
    pylab.grid(True)
    if best_max_depth is None:
        pylab.title('Training and Validation Accuracy as function of N Estimators')
        pylab.savefig("Accuracy_vs_N_Estimators.png")
    else:
        pylab.title('Training and Validation Accuracy as function of N Estimators With' +
                    ' Best Max Depth and Best Min Sample Leaf')
        pylab.savefig("Accuracy_vs_N_Estimators_modified.png")
    #pylab.show()
    pylab.clf()
    pylab.close()
    # End plot data --------------------------------------------------------------------------------

    (best_index, best_accuracy) = max(enumerate(validation_accuracy_list), key = itemgetter(1))
    best_n_estimator = n_estimators_list[best_index]
    return (best_n_estimator, best_accuracy)
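The helper module get_training_accuracy is not shown here; a minimal stand-in sketch of the interface the call above assumes (fit the classifier once, then score both splits):

def run(clf, training_data, training_labels, validation_data, validation_labels):
    # Hypothetical stand-in for the external get_training_accuracy module.
    clf.fit(training_data, training_labels)
    training_accuracy = clf.score(training_data, training_labels)
    validation_accuracy = clf.score(validation_data, validation_labels)
    return (training_accuracy, validation_accuracy)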
Example #2
def regression(data, y, model="forest"):
    if model == "forest":
        from sklearn.ensemble import RandomForestRegressor as rfc
        est = rfc(n_estimators=10, n_jobs=-1)

    elif model == "tree":
        from sklearn.tree import DecisionTreeRegressor as dtc
        est = dtc()

    elif model == "extra":
        from sklearn.ensemble import ExtraTreesRegressor as etc
        est = etc(n_estimators=10, n_jobs=-1)

    elif model == "linear":
        from sklearn.linear_model import LinearRegression as lr
        est = lr(n_jobs=-1)

    elif model == "svm":
        from sklearn.svm import SVR as svc
        est = svc()

    elif model == "boost":
        from sklearn.ensemble import GradientBoostingRegressor as gbc
        est = gbc(n_estimators=10)

    elif model == "neural":
        from sklearn.neural_network import MLPRegressor as nnc
        est = nnc(max_iter=10, learning_rate_init=1)

    est.fit(data, y)
    return est
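A minimal usage sketch for the factory above; the dataset is synthetic and the names are illustrative. A pandas Series is used for y since the companion classifier() below calls y.nunique():

import pandas as pd
from sklearn.datasets import make_regression

# Illustrative data; any feature matrix and numeric target will do.
X_arr, y_arr = make_regression(n_samples=200, n_features=5, random_state=0)
est = regression(pd.DataFrame(X_arr), pd.Series(y_arr), model="tree")
predictions = est.predict(X_arr)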
Example #3
def classifier(data, y, model="forest"):
    if model == "forest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        est = rfc(n_estimators=10, n_jobs=-1)

    elif model == "tree":
        from sklearn.tree import DecisionTreeClassifier as dtc
        est = dtc()

    elif model == "extra":
        from sklearn.ensemble import ExtraTreesClassifier as etc
        est = etc(n_estimators=10, n_jobs=-1)

    elif model == "logistic":
        from sklearn.linear_model import LogisticRegression as lr
        cases = y.nunique()
        if cases > 2: est = lr(solver="newton-cg", multi_class="multinomial")
        else: est = lr(n_jobs=-1)

    elif model == "svm":
        from sklearn.svm import SVC as svc
        est = svc()

    elif model == "boost":
        from sklearn.ensemble import GradientBoostingClassifier as gbc
        est = gbc(n_estimators=10)

    elif model == "neural":
        from sklearn.neural_network import MLPClassifier as nnc
        est = nnc(max_iter=10, learning_rate_init=1)

    est.fit(data, y)
    return est
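A matching usage sketch for classifier(); a pandas Series target is assumed so that y.nunique() works, and a three-class problem routes the "logistic" branch to the multinomial newton-cg solver:

import pandas as pd
from sklearn.datasets import make_classification

# Synthetic three-class problem (illustrative sizes).
X_arr, y_arr = make_classification(n_samples=300, n_features=8,
                                   n_informative=4, n_classes=3,
                                   random_state=0)
X_df = pd.DataFrame(X_arr)
est = classifier(X_df, pd.Series(y_arr), model="logistic")
print(est.predict(X_df.head()))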
def test_model():
    dataX, dataY = readData()
    dataY = dataY.reshape(dataY.shape[0])
    x_train = dataX[:m, :]
    x_test = dataX[m:, :]
    y_train = dataY[:m]
    y_test = dataY[m:]

    for i in range(100):
        n_estimators = 100
        model = rfc(n_estimators=n_estimators)
        model.fit(x_train, y_train)
        # save the model to disk
        with open(filename, 'wb') as f:
            pickle.dump(model, f)
        score = model.score(x_test, y_test)
        print("acc = ", score)

        acc_pre = np.max(model.predict_proba(x_train), 1)
        # print (acc_pre[:1000])
        label_pre = model.predict(x_train)
        for j in range(len(acc_pre)):
            if label_pre[j] != y_train[j]:
                if acc_pre[j] > 0.8:
                    print(
                        "label_predict = {}, label_Real = {}, ACC = {}".format(
                            label_pre[j], y_train[j], acc_pre[j]))
                    y_train[j] = label_pre[j]
Example #5
def rfc_learning_curve(features,
                       labels,
                       training_sizes,
                       gini,
                       score='accuracy',
                       perc=False,
                       return_raw=False):
    st = 10
    clf = rfc(n_estimators=20,
              max_depth=8,
              max_features=None,
              min_impurity_decrease=gini,
              random_state=st)
    ss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=st)
    train_sizes, train_scores, test_scores = learning_curve(
        clf,
        features,
        labels,
        cv=ss,
        train_sizes=training_sizes,
        shuffle=True,
        scoring=score,
        random_state=st)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_var = np.percentile(
        test_scores, 95, axis=1) if perc else np.std(test_scores, axis=1)
    if return_raw:
        return train_sizes, test_scores
    else:
        return train_sizes, test_scores_mean, test_scores_var
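A plotting sketch for the summary the function returns, using iris as stand-in data; it assumes the imports the snippet itself relies on (rfc, StratifiedShuffleSplit, learning_curve, np) are already in scope:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

features, labels = load_iris(return_X_y=True)
sizes, mean, spread = rfc_learning_curve(features, labels,
                                         training_sizes=np.linspace(0.1, 1.0, 5),
                                         gini=0.0)
plt.plot(sizes, mean)
plt.fill_between(sizes, mean - spread, mean + spread, alpha=0.2)
plt.xlabel('Training examples')
plt.ylabel('Accuracy')
plt.show()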
Example #6
def train_model(currHist, context):
    #training datasets
    trainingX = []
    trainingY = []
    priceChanges = []
    #delta
    for i in range(len(currHist) - 1):
        priceChanges.append((currHist[i + 1] - currHist[i]) / currHist[i])

    #dataset creation
    for i in range(
            len(currHist) - (context.historicalDays + context.predictionDays)):
        currDay = (i + context.historicalDays + context.predictionDays)
        currValue = 0
        if currHist[currDay] > currHist[currDay - context.predictionDays] * (
                1 + context.percentChange):
            currValue = 1
        elif currHist[currDay] < currHist[currDay - context.predictionDays] * (
                1 - context.percentChange):
            currValue = -1
        tempList = []
        for j in range(context.historicalDays - 1):
            tempList.append(priceChanges[i + j])
        trainingX.append(tempList)
        trainingY.append(currValue)

    #classifier
    clf = rfc()
    clf.fit(trainingX, trainingY)
    return (clf)
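A synthetic walk-through of train_model; the context object and its attribute values are assumptions for illustration, and rfc must already be imported as in the other examples:

import numpy as np
from types import SimpleNamespace

# Hypothetical context carrying the three attributes the function reads.
context = SimpleNamespace(historicalDays=10, predictionDays=3, percentChange=0.02)
rng = np.random.default_rng(0)
currHist = list(100 + np.cumsum(rng.normal(0, 0.5, 400)))  # fake price history

clf = train_model(currHist, context)
# Query with the most recent historicalDays - 1 = 9 relative price changes.
recent_changes = [(currHist[i + 1] - currHist[i]) / currHist[i]
                  for i in range(-10, -1)]
print(clf.predict([recent_changes]))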
Example #7
def createMoodTestModel(train, test):
    forest = rfc()
    forest.fit(np.delete(train, -2, 1), train[:, -2])
    scores = cross_val_score(forest, np.delete(test, -2, 1), test[:, -2])
    print(test.shape)
    print(train.shape)
    print(scores.mean())
Example #8
def randomForest_new(trainData, trainTarget, testData, testTarget, Act):
    print('==========Using Random forest classifier==========')

    trainX = np.array(trainData).astype(float)
    trainy = np.array(trainTarget).astype(float)
    testX = np.array(testData).astype(float)
    testy = np.array(testTarget).astype(float)

    clf = rfc(n_estimators=120)
    clf.fit(trainX, trainy)
    print(clf.score(testX, testy))
    y_pred = clf.predict(testX)
    y_preAr = precision_score(testy, y_pred, average=None)

    if Act != 'dummy':
        perf_measure(testy, y_pred, Act + ', Random Forest')

        precision, recall, fscore, support = score(testy, y_pred)
        #Sprint clf.predict_proba(X_test)
        x = roc_auc_score(testy, y_pred)
        print('The roc auc score is:', x)
        print('The Avg precision score is:',
              average_precision_score(testy, y_pred))
        print('precision: {}'.format(precision))
        print('recall: {}'.format(recall))
        print('fscore: {}'.format(fscore))
        print('support: {}'.format(support))
        return x
Example #9
def randomForest(X, y, Act):
    print('==========Using Random forest classifier==========')
    X1 = np.array(X).astype(float)
    y1 = np.array(y).astype(float)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X1, y1, test_size=0.4, random_state=0)
    clf = rfc(n_estimators=120)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    y_preAr = precision_score(y_test, y_pred, average=None)
    #EERCalc(clf.predict_proba(X_test), y_test, y_pred,"RF")
    #print clf.predict_proba(X_test)
    if Act != 'dummy':
        perf_measure(y_test, y_pred, Act + ', Random Forest')
        precision, recall, fscore, support = score(y_test, y_pred)
        #Sprint clf.predict_proba(X_test)
        x = roc_auc_score(y_test, y_pred)
        print('The roc auc score is:', x)
        print('The Avg precision score is:',
              average_precision_score(y_test, y_pred))
        print('precision: {}'.format(precision))
        print('recall: {}'.format(recall))
        print('fscore: {}'.format(fscore))
        print('support: {}'.format(support))
    return EERCalc(clf.predict_proba(X_test), y_test, y_pred, "RF")
Example #10
def RandomForest(df, df_pred):
    df_X = df.drop('goes_up', axis=1)
    df_y = df['goes_up']
    X = df_X.to_numpy()
    y = df_y.to_numpy()
    pred_arr = df_pred.to_numpy()
    pred_arr = np.nan_to_num(pred_arr)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)

    #params = {'criterion':('gini', 'entropy'),
    #      'max_depth':(3, 5, 7, 9),
    #      'min_samples_leaf':(3, 5, 8, 10)
    #      }

    #clf = GridSearchCV(rfc(), param_grid=params, cv=5)
    #clf.fit(X_train, y_train)
    #clf.best_params_

    model_rfc = rfc(criterion='entropy',
                    max_depth=5,
                    random_state=0,
                    min_samples_leaf=5)
    model_rfc.fit(X_train, y_train)

    df_RFC = model_rfc.predict(pred_arr)

    return roc_auc_score(y_test, model_rfc.predict(X_test)), df_RFC
def train_model_rfc_calibrated (features, labels) :
	# First, set aside some of the training set for calibration
	# Use stratified shuffle split so that class ratios are maintained after the split
	splitter = StratifiedShuffleSplit(labels, n_iter = 1, train_size = 0.7, random_state = 30)

	# Length is 1 in this case since we have a single fold for splitting
	print (len(splitter))

	for train_idx, calib_idx in splitter:
		features_train, features_calib = features[train_idx], features[calib_idx]
		labels_train, labels_calib = labels[train_idx], labels[calib_idx]

	print ("features_train shape: ", features_train.shape)
	print ("features_calib shape: ", features_calib.shape)
	print ("labels_train shape: ", labels_train.shape)
	print ("labels_calib shape: ", labels_calib.shape)
		
	print ("Performing Grid Search ...")
	# params_dict = {'criterion': ['entropy'], 'n_estimators':[30, 35, 40, 45], 'max_depth':[5, 6], 'min_samples_leaf': [1, 2, 5], 'min_samples_split': [2, 5, 10]}
	params_dict = {'criterion': ['entropy'], 'n_estimators':[60, 70, 80, 90], 'max_depth':[5, 6], 'min_samples_leaf': [1, 2, 5], 'min_samples_split': [2, 5, 10], 'max_features' : [6, 7, 8]}
	clf = GridSearchCV(rfc(random_state = 30, n_jobs = 4), params_dict, scoring = 'roc_auc', cv = 5)
	clf.fit(features_train, labels_train)

	print ("Best estimator: ", clf.best_estimator_)
	print ("Best best scores: %.4f" %(clf.best_score_))
	# print ("Best grid scores: ", clf.grid_scores_)

	# Perform calibration 
	# Use 'sigmoid' because sklearn cautions against using 'isotonic' for fewer than 1000 calibration samples, as it can result in overfitting
	print ("Performing Calibration now ...")
	sigmoid = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
	sigmoid.fit(features_calib, labels_calib)
	return sigmoid
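For reference, the same split-then-calibrate pattern with the current scikit-learn API; a sketch assuming a version where CalibratedClassifierCV still accepts cv='prefit':

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=30)
X_tr, X_cal, y_tr, y_cal = train_test_split(X, y, train_size=0.7,
                                            stratify=y, random_state=30)
base = RandomForestClassifier(n_estimators=80, random_state=30).fit(X_tr, y_tr)
calibrated = CalibratedClassifierCV(base, cv='prefit', method='sigmoid')
calibrated.fit(X_cal, y_cal)
probs = calibrated.predict_proba(X_cal)[:, 1]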
def getResult(url):

    #Importing dataset 
    filen = 'dataset.csv'
    r = open(filen,'rt')
    data = np.loadtxt(r, delimiter = ",") # loading the dataset

    #let's separate features and labels
    X = data[: , :-1]
    y = data[: , -1]

    #Separating training features, testing features, training labels & testing labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    clf = rfc()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(score*100)  # accuracy score

    X_new = []

    X_input = url  # url to classify
    X_new = feature_extraction.generate_data_set(X_input)  # extracting features of the given url
    X_new = np.array(X_new).reshape(1, -1)  # converting to a 2-D array for predict()

    try:
        prediction = clf.predict(X_new)
        if prediction == -1:
            return "Omg!!!.. its Phishing Url"
        else:
            return "hureeh!!....its a Genuine Url"
    except Exception:
        return "Omg!!... its a Phishing Url"
Example #13
def rf_classifier(X_train, y_train, X_test, y_test, method, estimators,
                  num_features, preprocessing_method):
    print('Random Forest Classification using estimators', estimators,
          'and preprocessing via', method)
    classifier = rfc(n_estimators=estimators)

    if method == 'pca':
        print('Performing dimensional reduction with features', num_features)
        X_train = dimensional_reduction(X_train.astype(float),
                                        y_train,
                                        num_features=num_features)
        X_test = dimensional_reduction(X_test.astype(float),
                                       y_test,
                                       num_features=num_features)
    else:
        X_train = sklearn_preprocessing(X_train.astype(float),
                                        y_train.astype(float),
                                        preprocessing_method)
        X_test = sklearn_preprocessing(X_test.astype(float),
                                       y_test.astype(float),
                                       preprocessing_method)

    classifier.fit(X_train, y_train)

    y_test_predicted = classifier.predict(X_test)
    return classifier, X_train, y_train, y_test_predicted
Example #14
def initialize_models(X_train, y_train, X_test, y_test, accuracy, fscore):
    # TODO: Initialize the three models
    clf_A = dtc(random_state=13)
    clf_B = rfc(random_state=13)
    clf_C = abc(random_state=13)

    # TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
    # HINT: samples_100 is the entire training set i.e. len(y_train)
    # HINT: samples_10 is 10% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
    # HINT: samples_1 is 1% of samples_100 (ensure to set the count of the values to be `int` and not `float`)
    samples_100 = len(y_train)
    samples_10 = len(y_train) // 10
    samples_1 = len(y_train) // 100

    # Collect results on the learners
    results = {}
    for clf in [clf_A, clf_B, clf_C]:
        clf_name = clf.__class__.__name__
        results[clf_name] = {}
        for i, samples in enumerate([samples_1, samples_10, samples_100]):
            results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)

    # Run metrics visualization for the three supervised learning models chosen
    vs.evaluate(results, accuracy, fscore)
    return clf_C
def resolve(url):
    # Importing dataset
    data = np.loadtxt(os.path.dirname(__file__) + "/dataset.csv",
                      delimiter=",")

    # Separating features and labels
    X = data[:, :-1]
    y = data[:, -1]

    # Separating training features, testing features, training labels & testing labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = rfc()
    clf.fit(X_train, y_train)


    score = clf.score(X_test, y_test)

    # print("accuracy = ", score * 100)

    X_new = []

    X_input = url
    X_new = feature_extraction.generate_data_set(X_input)
    X_new = np.array(X_new).reshape(1, -1)

    try:
        prediction = clf.predict(X_new)
        if prediction == 1:
            return "Legitimate Url"
        else:
            return "Suspicious Url"
    except Exception:
        return "Phishing Url"
Example #16
def RandomForestClassifer():
    
    #loading dataset
    data = np.loadtxt("dataset.csv", delimiter = ",")
    
    #separate features and labels: columns 1-30 are features, column 31 is the label
    x = data[: , :-1]
    y = data[: , -1]
    
    #Separating training features, testing features, training labels & testing labels
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)    #here 20% data is for testing
    #x variables contain features and y contains results
    
    print("Training a random forest model on given dataset")
    start = time.time()                                         #store the start time for training and testing of model
    classifier = rfc()                                          #using random forest classifier model
    print("Random Forest classifier created.")
    print("Beginning model training.")
    classifier.fit(x_train, y_train)                            #train the model
    print("Model training completed.")
    predictions = classifier.predict(x_test)                    #do predictions on the model for testing data
    print("Predictions on testing data computed.")
    end = time.time()                                           #store the end time for training and testing of model
    accuracy = 100.0 * accuracy_score(y_test, predictions)     #calculate accuracy of the model and store it in 'accuracy'
    print("The accuracy of your random forest model on testing data is: " + str(accuracy) + " %")
    f1score = f1_score(y_test, predictions)                    #calculate f1 score of the model and store it in 'f1score'
    print("The f1-score of your random forest model on testing data is: " + str(f1score))
    precision = precision_score(y_test, predictions)           #calculate precision of the model and store it in 'precision'
    print("The precision of your random forest model on testing data is: " + str(precision))
    recall = recall_score(y_test, predictions)                 #calculate recall of the model and store it in 'recall'
    print("The recall of your random forest model on testing data is: " + str(recall))
    runtime = end - start                                      #calculate total time taken for training and testing of model
    print("Total time taken for training and testing by random forest model is: " + str(runtime) + " s")
Example #17
def getResult(url):

    #Importing dataset
    data = np.loadtxt("dataset.csv", delimiter=",")

    #Separating features and labels
    X = data[:, :-1]
    y = data[:, -1]

    #Separating training features, testing features, training labels & testing labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = rfc()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(score * 100)

    X_new = []

    X_input = url
    X_new = feature_extraction.generate_data_set(X_input)
    X_new = np.array(X_new).reshape(1, -1)

    try:
        prediction = clf.predict(X_new)
        if prediction == -1:
            return "Phishing Url"
        else:
            return "Legitimate Url"
    except Exception:
        return "Phishing Url"
Example #18
def HCC(header_in,
        train_in,
        test_in,
        tscore="SP",
        baseClassifier=None,
        type_prop="all_probabilities"):
    # avoid a shared default estimator instance across calls
    if baseClassifier is None:
        baseClassifier = rfc()
    return hcc.HCC(header_in, train_in, test_in, tscore, type_prop,
                   baseClassifier)
Example #19
def decision_tree(num_tree, depth, need_stretch):
    train = preprocess(images, need_stretch)
    test = preprocess(test_img, need_stretch)
    model = rfc(n_estimators=num_tree, max_depth=depth)
    model.fit(train, train_y)
    train_res = model.predict(train)
    test_res = model.predict(test)
    return train_res, test_res
Example #20
def random_forest_classifier(x_train, y_train, x_test, y_test,tree):
    model = rfc(n_estimators=tree, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,
                min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, bootstrap=False,
                oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    expected = y_test
    return expected, predicted
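A scoring sketch for the (expected, predicted) pair the function returns, on iris; note that max_features="auto" above only exists on older scikit-learn releases:

from sklearn.datasets import load_iris
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
expected, predicted = random_forest_classifier(x_train, y_train,
                                               x_test, y_test, tree=100)
print(classification_report(expected, predicted))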
def get_feature_importances (features, labels) :
	# clf = gbc(random_state = 30, max_depth = 3, n_estimators = 100, min_samples_leaf = 2, min_samples_split = 2, learning_rate = 0.05, subsample = 0.9)
	clf = rfc(random_state = 30, max_depth = 6, n_estimators = 100, min_samples_leaf = 1, min_samples_split = 2, n_jobs = 4, criterion = 'entropy')
	clf.fit(features, labels)
	# print ("Feature Importances: ",  clf.feature_importances_)
	print ("Header", header)
	print ("Feature_Importances: ", sorted(zip(map(lambda x: round(x, 5), clf.feature_importances_), header[1:]), 
             reverse=True))
	return clf
Example #22
def random_forest_pred(input_features):
    x = iris.iloc[:, :4]
    y = iris.iloc[:, 4]

    clf = rfc(n_jobs=2)
    clf.fit(x, y)

    y_pred = clf.predict(input_features)
    return y_pred
Example #23
def randomClassification(x, y, testX, testY):

    model = rfc()

    model.fit(x, y)

    print("Fitting Complete. Displaying Results... / 모델 피팅 성공. 결과 출력...")

    print("R^2 Score:",model.score(testX, testY))
def RFC(test_data, test_label, train_data, train_label, d):
    rfc_classifier = rfc(n_estimators=d)
    # score() returns accuracy, so (1 - accuracy) * n gives the error count
    rfc_train_acc = rfc_classifier.fit(train_data, train_label).score(
        train_data, train_label)
    rfc_test_acc = rfc_classifier.score(test_data, test_label)
    y_predict = rfc_classifier.predict(test_data)
    return y_predict, (1 - rfc_train_acc) * len(train_data), (
        1 - rfc_test_acc) * len(test_data)
Example #25
def get_default_classifier(self):
    if self.classifier == 'rfc':
        return rfc()
    if self.classifier == 'gbc':
        return gbc()
    if self.classifier == 'svc':
        return svc()

    raise ValueError("Only the classifiers 'rfc', 'svc', or 'gbc' are allowed")
Example #26
def __init__(self, trees, depth, class_to_int, int_to_class, feat,
             classes):
    self.class_to_int = class_to_int
    self.int_to_class = int_to_class
    self.model = rfc(n_estimators=trees,
                     max_depth=depth,
                     n_jobs=-1,
                     verbose=1)
    self.model.fit(feat, classes)
def random_forest():
    x = iris.iloc[:, :4]
    y = iris.iloc[:, 4]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    clf = rfc(n_jobs=2)
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    return accuracy_score(y_test, y_pred)
Example #28
def randomforest(filename):
    trainX, trainY = modelcv.modelinput(filename, 33)
    clf = rfc(n_estimators=250,
              min_samples_split=6,
              n_jobs=-1,
              class_weight='balanced')
    clf.fit(trainX, trainY)
    #score=cross_val_score(clf, trainX, trainY, cv=3,verbose=True,n_jobs=-1)
    #print(np.average(score))
    outputfile = '../models/RFCmodel.sav'
    joblib.dump(clf, outputfile, compress=9)
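Loading the saved model back is the mirror operation; a brief sketch, assuming the same joblib used for the dump:

import joblib

clf = joblib.load('../models/RFCmodel.sav')
# clf now predicts exactly as the freshly trained forest did, e.g.
# clf.predict(features) for any matrix with the same 33 feature columns.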
Example #29
def randomforest(X_train, y_train):
    param_rfc = {
        'n_estimators': 500,
        'max_depth': 6,
        'min_samples_split': 4,
        'min_samples_leaf': 2,
        'max_features': 0.5,
        'n_jobs': 4
    }
    clf_rfc = rfc(**param_rfc)
    clf_rfc.fit(X_train, y_train)
    return clf_rfc
Example #30
def randomForest_new(X_train1, y_train1, X_test1, y_test1):
    X_train = np.array(X_train1).astype(float)
    y_train = np.array(y_train1).astype(float)
    X_test = np.array(X_test1).astype(float)
    y_test = np.array(y_test1).astype(float)
    clf = rfc(n_estimators=120)
    print('==========Using Random forest classifier==========')
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    perf_measure_new(y_test, y_pred)
    return EERCalc(clf.predict_proba(X_test), y_test, y_pred, "RF")
def train_model_adab_stacked_rfc (features, labels) :
	base_model = rfc(n_estimators = 80, max_features = 7,
                      max_depth=6, random_state = 30,
                      criterion = 'entropy')
	# model = BaggingClassifier(base_estimator = base_model)
	params_dict = {'learning_rate' : [0.03, 0.05, 0.1], 'n_estimators':[20, 50, 100]}
	
	clf = GridSearchCV(adab(random_state = 30, base_estimator = base_model), params_dict, n_jobs = -1, scoring = 'roc_auc', cv = 5)
	clf.fit(features, labels)

	print ("Best estimator: ", clf.best_estimator_)
	print ("Best best scores: %.4f" %(clf.best_score_))
	return clf
Example #32
def fit(self, X, s):  # estimates the labeling probability c for PU learning
    if self.trad_clf is None:
        self.trad_clf = rfc(n_estimators=1500,
                            class_weight="balanced",
                            n_jobs=4)
    c = np.zeros(self.n_folds)
    skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True)
    for i, (itr, ite) in enumerate(skf.split(X, s)):
        self.trad_clf.fit(X[itr], s[itr])
        # average predicted P(s=1|x) over the held-out labeled positives
        c[i] = self.trad_clf.predict_proba(X[ite][s[ite] == 1])[:, 1].mean()
    self.c = c.mean()
    return self
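The constant c estimated above is the Elkan-Noto calibration factor for positive-unlabeled learning: the traditional classifier models p(s=1|x), and under the PU assumptions p(y=1|x) = p(s=1|x) / c. A hypothetical companion method built on the fitted attributes above:

def predict_proba(self, X):
    # Hypothetical sketch: apply the Elkan & Noto correction, clipping so
    # the scaled score stays a valid probability.
    p_labeled = self.trad_clf.predict_proba(X)[:, 1]
    return np.clip(p_labeled / self.c, 0.0, 1.0)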
def train_model_bagging (features, labels) :
	base_model = rfc(n_estimators = 80, max_features = 20,
                      max_depth=6, random_state = 30,
                      criterion = 'entropy')
	# model = BaggingClassifier(base_estimator = base_model)
	params_dict = {'max_features': [0.5, 0.8], 'max_samples': [0.5, 0.8, 1], 'n_estimators':[25, 50, 75]}
	
	clf = GridSearchCV(BaggingClassifier(random_state = 30, n_jobs = -1, base_estimator = base_model), params_dict, scoring = 'roc_auc', cv = skf(labels, n_folds = 5, random_state = 30))
	clf.fit(features, labels)

	print ("Best estimator: ", clf.best_estimator_)
	print ("Best best scores: %.4f" %(clf.best_score_))
	return clf
Example #34
def getTrainedClassifier(classifierType, train):
    if classifierType == "naiveBayes":
        from nltk.classify import NaiveBayesClassifier
        trainedClassifier = NaiveBayesClassifier.train(train)
    elif classifierType == "randomForest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        trainedClassifier = SklearnClassifier(rfc(n_estimators=25, n_jobs = 2))
        trainedClassifier.train(train)
    elif classifierType == "knn5":
        from sklearn.neighbors import KNeighborsClassifier as knn
        trainedClassifier = SklearnClassifier(knn(5))
        trainedClassifier.train(train)
    return trainedClassifier
Example #35
def model_randomforest_classifier(X_train, X_test, y_train, y_test):
    model_name = f'model_{count}_randomforest_classifier'

    model = rfc()
    model.fit(X_train, y_train)
    model.independentcols = independentcols
    y_pred = model.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    # print(classification_report(y_test, y_pred))
    score = accuracy_score(y_test, y_pred)

    print(f'{model_name} accuracy: {score}')
    joblib.dump(model, f'model/{model_name}.joblib')
def train_model_rfc_calibrated_cv (features, labels, hold_out = False, train_sz = 0.9) :
	features_train, features_test = [], []
	labels_train, labels_test = [], []
	if (hold_out == True) :
		# First, set aside some of the training set for calibration
		# Use stratified shuffle split so that class ratios are maintained after the split
		splitter = StratifiedShuffleSplit(labels, n_iter = 1, train_size = train_sz, random_state = 30)

		# Length is 1 in this case since we have a single fold for splitting
		print (len(splitter))

		for train_idx, test_idx in splitter:
			features_train, features_test = features[train_idx], features[test_idx]
			labels_train, labels_test = labels[train_idx], labels[test_idx]
	else :
		features_train = features
		labels_train = labels

	print ("features_train shape: ", features_train.shape)
	print ("labels_train shape: ", labels_train.shape)
	if (hold_out == True) :
		print ("features_test shape: ", features_test.shape)
		print ("labels_test shape: ", labels_test.shape)
		
	print ("Parameters selected based on prior grid Search ...")
	#clf = rfc(random_state = 30, n_jobs = 4, criterion = 'entropy', max_depth = 7, min_samples_leaf = 2, min_samples_split = 5, n_estimators = 50)
	#clf = rfc(random_state = 30, n_jobs = 4, criterion = 'gini', max_depth = 8, min_samples_leaf = 5, min_samples_split = 2, n_estimators = 120)
	# clf = rfc(random_state = 30, n_jobs = 4, criterion = 'gini', class_weight = 'auto', max_depth = 5, min_samples_leaf = 5, min_samples_split = 2, n_estimators = 100)
	clf = rfc(random_state = 30, n_jobs = 4, criterion = 'entropy', class_weight = 'auto', max_depth = 5, min_samples_leaf = 5, min_samples_split = 2, n_estimators = 60)

	# Perform calibration 
	# Use 'sigmoid' because sklearn cautions against using 'isotonic' for lesser than 1000 calibration samples as it can result in overfitting
	# 05/22 - Looks like isotonic does better than sigmoid for both Brier score and roc_auc_score.
	# Using 30-40% holdout actually improves ROC AUC for holdout score from 0.88 to 0.925 with CV=5
	print ("Performing Calibration now ...")
	# sigmoid = CalibratedClassifierCV(clf, cv=5, method='sigmoid')
	sigmoid = CalibratedClassifierCV(clf, cv=5, method='isotonic')
	sigmoid.fit(features_train, labels_train)

	if (hold_out == True) :
		# Calculate Brier score loss
		y_probs = sigmoid.predict_proba(features_test)[:, 1]
		clf_score = brier_score_loss(labels_test, y_probs)
		print ("Brier score: ", clf_score)
		auc_score = estimate_roc_auc (sigmoid, features_test, labels_test)

	return sigmoid
def train_model_rfc (features, labels) :
	# Start with reduced param space
	# Best came in at the higher end of 1000, 6, so increase
	# params_dict = {'criterion': ['entropy'], 'n_estimators':[40, 60, 80, 100], 'max_depth':[5, 6, 7], 'min_samples_leaf': [1, 2, 5], 'min_samples_split': [2, 5, 10], 'max_features' : [6, 7]}
	params_dict = {'class_weight' : ['auto'],  'criterion': ['entropy'], 'n_estimators':[50, 60, 70], 'max_depth':[4, 5, 6], 'min_samples_leaf': [1, 2, 5], 'min_samples_split': [2, 5, 10]}

	# params_dict = {'criterion': ['entropy'], 'n_estimators':[100, 150, 200, 250, 300], 'max_depth':[None], 'min_samples_split': [1, 2, 5], 'max_features': [6, 7, 8, 9]}
	
	### Train estimator (initially only on final count
	# skf = StratifiedKFold
	clf = GridSearchCV(rfc(random_state = 30, n_jobs = 4), params_dict, scoring = 'roc_auc', cv = 5)
	clf.fit(features, labels)

	print ("Best estimator: ", clf.best_estimator_)
	print ("Best best scores: %.4f" %(clf.best_score_))
	#print ("Best grid scores: ", clf.grid_scores_)
	return clf
def train_model_rfc_pipeline (features, labels) :
	scaler = StandardScaler()
	clf_rfc = rfc(random_state = 30, n_jobs = 4, criterion = 'entropy')

	# Transforms are applied exactly in the order specified
	estimators = [('sscaler', scaler), ('rfc', clf_rfc)]

	t0 = time.perf_counter()
	
	# Use pipeline directly in GridSearchCV
	params_dict = {'rfc__n_estimators': [100, 300, 500, 700], 'rfc__max_depth': [1, 2, 3], 'rfc__min_samples_split':[10, 20, 50], 'rfc__min_samples_leaf':[1, 2, 5]}
	clf = GridSearchCV(Pipeline(estimators), params_dict, cv = 5, scoring = 'roc_auc')
	clf.fit(features, labels)

	print ("Grid Search CV time: ", time.clock() - t0 )
	print ("Best estimator: ", clf.best_estimator_)
	print ("Best grid scores: %.4f" %(clf.best_score_))
	return clf
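Once the search finishes, the tuned forest can be pulled back out of the winning pipeline through named_steps; a brief sketch:

best_forest = clf.best_estimator_.named_steps['rfc']
print(best_forest.feature_importances_)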
Example #39
data=[]
for row in csv_file_object:
	data.append(row)
test_data = np.array(data)

# clean the test data, then write it out to another file
cleaned_test_data = clean_test_data(test_data)
open_file_object = csv.writer(open("cleaned_test.csv", "w", newline=""))
for row in cleaned_test_data:
	open_file_object.writerow(row)

################################################################################

# Create the random forest object which will include all the parameters
# for the fit
Forest = rfc(n_estimators = 100)

# fit the training input and output to create the 
# decision trees
Forest = Forest.fit(cleaned_train_data[0::,2::],cleaned_train_data[0::,1])

output = Forest.predict(cleaned_test_data[0::,1::])

# generate and save the output csv file
fo = csv.writer(open("result.csv", "w", newline=""))

res_array = np.zeros(shape=(len(output), 2))

print ('output is')
print (output)
print ('data is')
Exemplo n.º 40
0
# [1, 1, 0, ...]

print(len(all_team))
print(len(result))


print('Elapsed time: %.2fs' % (time.time() - st))

st = time.time()
X_train, X_test, y_train, y_test = cross_validation.train_test_split(all_team, result, test_size=0.2, random_state=1)

# Try classifier
clf = SVC()
print('done')
clf.fit(X_train, y_train)
result1 = clf.predict(X_test)
print(classification_report(y_test, result1))
print(accuracy_score(y_test, result1))
print('Elapsed time: %.2fs' % (time.time() - st))

st = time.time()
clf2 = rfc(n_estimators=5)
clf2.fit(X_train, y_train)
result2 = clf2.predict(X_test)
print(classification_report(y_test, result2))
print(accuracy_score(y_test, result2))
print('Elapsed time: %.2fs' % (time.time() - st))

cursor.close()
conn.commit()
conn.close()
#training = training.astype('float64')
#validation = validation.astype('float64')
print('partitioned data...')

# Splitting Training Up
y_train = training[:,1]    # labels
x_train = training[:,2:]   # everything else
# Splitting validation up
y_valid = validation[:,1]  # read y values
x_valid = validation[:,2:]

tune_grid = [{'n_estimators':[10,100,250,500],
			'criterion':['gini','entropy']
			}]

best_model = GridSearchCV( rfc(), tune_grid, cv=10, verbose=2, n_jobs=5).fit(x_train,y_train)

y_pred = best_model.predict(x_valid)

p = []; v = [];
for i in range(len(y_pred)):
	p.append(int(y_pred[i]))

for j in range(len(y_valid)):
	v.append(int(y_valid[j]))

cm = confusion_matrix(v,p)
asm = accuracy_score(v,p)
print(cm)
print("Accuracy: %f" % asm)
print(best_model.best_estimator_)
Example #42

# load data into pandas data frame
trdata,testdata=mg.loadData()

# get the id's for the test set
testid = np.array(testdata.PassengerId)

# determine if each passenger has a known surviving family member
trdata,testdata=mg.addFamSurvivors(trdata,testdata)

# munge the data to generate one-hot labels for gender, titles, ticket departments
trdata=mg.mungeData(trdata)
testdata=mg.mungeData(testdata)


# initialize classifier

model = rfc(n_estimators=1000, oob_score=True)

model = model.fit(trdata.iloc[:,1:],trdata.iloc[:,0])

accur = model.oob_score_

print('Out of Bag accuracy: %f \n' %accur)

# generate predictions
preds = model.predict(testdata)

# save out
mg.writeout(preds,testid,'predictions/rfcmodel_test.csv')
Example #43
train_data = np.array(train_data)
test_data = np.array(test_data)
valid_data = np.array(valid_data)

train_class = np.ravel(np.array(train_class))
test_class = np.ravel(np.array(test_class))
valid_class = np.ravel(np.array(valid_class))

print(train_data.shape)
print(test_data.shape)
print(valid_data.shape)

#train_data = train_data[0:100,:]
#train_class = train_class[0:100]

forest = rfc(n_estimators=500, min_samples_split=9, criterion='gini')
forest_parameters = {'n_estimators': [100, 200, 500, 1000, 2000], 'min_samples_split': [3, 5, 7, 9, 11, 13, 15, 17, 19]}

clf = gsc(forest, forest_parameters)
#clf = forest

clf.fit(train_data,train_class)


print(clf.grid_scores_)

print(clf.best_score_)
print(clf.best_estimator_)
print(clf.best_params_)
Example #44
# initialize classifier

if __name__ == "__main__":

    num = 4

    train_data_file, test_data_file, test_result_dir = cmd.TrainTestFileParser(sys.argv, num)

    test_result_file = open(test_result_dir + "rfc_test_result" + str(num) + ".txt", "w+")

    trdata = pd.read_csv(train_data_file, header=None, sep=" ")
    tedata = pd.read_csv(test_data_file, header=None, sep=" ")

    # depthlist = [5,10,15,20,50,100]
    model = rfc(n_estimators=5000, oob_score=True, max_features=None, max_depth=10)
    model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
    accur = model.score(tedata.iloc[:, 1:], tedata.iloc[:, 0])
    resultClass = model.predict(tedata.iloc[:, 1:])
    # resultLogProba = model.predict_log_proba(tedata.iloc[:,1:])
    resultProba = model.predict_proba(tedata.iloc[:, 1:])

    for x, y in zip(resultClass, resultProba):
        test_result_file.write(str(x) + " ")
        for z in y:
            test_result_file.write(str(z) + " ")
        test_result_file.write("\n")

    print(len(resultProba))

    print ("Test data accuracy: %f\n" % accur)
Example #45
dt = iris_data.data
lbls = iris_data.target


#train a KNN and see how does it perform. Keep 50000 for training and 10000 for validation and 10000 for final test.


num_fold = 10
gen_k_sets = StratifiedKFold(lbls,num_fold)
ab = []
for nb in range(1,136,1):
    
    dst_mdl = nn(n_neighbors=nb)
    overall_mis = 0
    mdl = SVC(C=1.0,kernel='linear')
    mdl=rfc(n_estimators=500)
  
    

    for train_index, test_index in gen_k_sets:   
        train_data, test_data = dt[train_index], dt[test_index]
        train_class, test_class = lbls[train_index], lbls[test_index]
    
    
        j = 0
        


        for i,td in enumerate(test_data):
            td = np.array(td)
            tst_class_act=test_class[i]
Example #46
lbls = np.array(dgts_lbl)
print(lbls.shape)

train_lbl,test_lbl,train_dt,test_dt = train_test_split(lbls,dt,test_size = 0.15,random_state=1299004)

clstrs = 10
clst = KMeans(n_clusters = clstrs,n_init=30,tol=0.00001,max_iter=500)
clst.fit(train_dt)
clsts_lbl = np.reshape(np.array(clst.labels_),(train_dt.shape[0],1))

#td = np.hstack((train_dt,clsts_lbl))
mdls = []

for i in range(clstrs):
    t_idx = np.where(clsts_lbl==i)[0]
    mdl = rfc(n_estimators=50,criterion='entropy',oob_score=True)
    mdl = etc(n_estimators=5000,criterion='entropy',oob_score=True,bootstrap=True,min_samples_split=30)
    #mdl = SVC(C=10000,gamma=0.00001,kernel='rbf')
    mdl = knn(n_neighbors=1)
    td = train_dt[t_idx]
    tc = train_lbl[t_idx]
    #print tc
    mdl.fit(td,tc)
    print(mdl.score(td,tc))

    #print mdl.oob_score_
    mdls.append(mdl)
    
scrs= []
clst_lbl_tst = np.reshape(np.array(clst.predict(test_dt)),(test_dt.shape[0],1))
for i in range(clstrs):
Example #47
dgts_data = np.array(dgts_data)
print(dgts_data.shape)
print(dgts_data)

dgts_lbl = pd.read_csv("abcd_l.csv",index_col=0)
print(dgts_lbl.head())
print(dgts_lbl.shape)
dgts_lbl = np.array(dgts_lbl)
print(dgts_lbl.shape)
print(dgts_lbl)

#train a KNN and see how does it perform. Keep 50000 for training and 10000 for validation and 10000 for final test.

gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.20)
mdl = SVC()
mdl = rfc()
dst_mdl = nn(n_neighbors=100)

for train_index, test_index in gen_k_sets:   
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    #test_data= test_data[:1000,]
    #test_class = test_class[:1000]
    #print g
    
    dst_mdl.fit(train_data)
    #print mdl.score(train_data,train_class)
    print(train_data.shape)
    j = 0
    for i,td in enumerate(test_data):
        td = np.array(td)
Example #48
cols = np.array(cols)
cols = list(cols[:,0])
print(cols)


train_data = train_data[:,cols]
test_data = test_data[:,cols]
valid_data = valid_data[:,cols] 

print(train_data.shape)
print(test_data.shape)
print(valid_data.shape)
"""
# train_class = train_class[0:100]

svc = rfc(n_estimators=500, min_samples_split=9, criterion="gini")
svm_parameters = {"n_estimators": [500], "min_samples_split": [9]}

clf = gsc(svc, svm_parameters)
clf = svc

clf.fit(train_data, train_class)
print()

print(clf.score(valid_data, valid_class))
print(clf.score(test_data, test_class))
# print svc.feature_importances_
"""
print(clf.grid_scores_)

print(clf.best_score_)
Example #49
#    plt.subplot(2,5,i)
#    plt.hist(X[:,i],bins=500)


# Train the model

# pca = PCA(n_components=40)
# pca.fit(nptrd[:,range(1,94)])
# X = pca.transform(nptrd[:,range(1,94)])
# PCAExplained = sum(pca.explained_variance_ratio_)

# Most of the features are highly skewed i.e. their 75% value ranges when we do td.describe() is 0 while their max is much higher.
# This indicates that only a few values are non-zero for most features.
# This could mean that these features are actually categorical variables that are encoded in the test data.. could .. not sure

forest = rfc(n_estimators=100, n_jobs=-1, min_samples_split=20, min_samples_leaf=10)
# forest = forest.fit(nptrd[:,range(1,94)],nptrd[:,-1])
forest = forest.fit(X, nptrd[:, -1])

# temp = forest.predict(nptrd[:,range(1,94)])
temp = forest.predict(X)
TrainAccuracy = sum(temp == nptrd[:, -1]) / (len(nptrd) * 1.0)

# Need to spend some time checking for overfit - using some elbow techiques maybe


# Cross validate the model using the cross validation dataset

XCv = pipeline.transform(npcvd[:, range(1, 94)])
# output = forest.predict(npted[:,range(1,94)])
outputCv = forest.predict(XCv)
white_corr_rho=pd.DataFrame(white_corr_rho,index=range(0,11),columns=range(0,11))
white_corr_pval=pd.DataFrame(white_corr_pval,index=range(0,11),columns=range(0,11))
print(white_corr_rho)
print(white_corr_pval)

#RANDOM FOREST MODELING: RED---------------------------------------------------

#set iterations
iterations=20

#create empty data frames for prediction results and feature importances
red_results=pd.DataFrame(index=dfr_exp.index, columns=range(0,iterations))
red_features=pd.DataFrame(index=range(0,11), columns=range(0,iterations))

#fit model using StratifiedKFold
rf=rfc(n_estimators=360, max_features=5, criterion='gini')
for j in range(0,iterations):
    folds = skf(dfr_res, 5, shuffle=True)
    for train, test in folds:
        model=rf.fit(dfr_exp.ix[train,], dfr_res[train])
        red_results.ix[test,j] = pd.Series(model.predict(dfr_exp.ix[test,]), index=test, name=[j])
        red_features[j]=pd.Series(model.feature_importances_)
    print(j)

#write results to file
red_results.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt', sep='\t', header=True)
red_features.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt', sep='\t', header=True)

#retrieve results as needed
#red_results=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt', sep='\t', header=False, names=range(0,iterations))
#red_features=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt', sep='\t', header=False, names=range(0,iterations))
Example #51
print('Optimal N Estimator with default settings was: ' + str(best_n_estimator) +
      ' with accuracy: ' + str(best_n_estimator_accuracy))
print('Optimal N Estimator with modified settings was: ' + str(best_n_estimator_modified) + 
      ' with accuracy: ' + str(best_n_estimator_modified_accuracy))

# Get Test error with best configuration of Decision Tree and Random Forest
(training_data, training_labels, _, _) = preprocess_data.run_for_training_data(1)
(test_data, test_labels) = preprocess_data.run_for_test_data()

# Align data
missing_headers = training_data.columns.difference(test_data.columns)
test_data[missing_headers] = training_data[missing_headers].applymap(lambda x: False)

# Decision Tree
clf = dtc(criterion='entropy', max_depth=best_max_depth, min_samples_leaf=best_min_samples_leaf)
(_, test_accuracy_dt) = get_training_accuracy.run(clf, training_data, training_labels,
                                                  test_data, test_labels)

# Random Forest
clf = rfc(n_estimators=best_n_estimator_modified, max_depth=best_max_depth,
          min_samples_leaf=best_min_samples_leaf)
(_, test_accuracy_rf) = get_training_accuracy.run(clf, training_data, training_labels,
                                                  test_data, test_labels)

print('Test accuracy for Decision Tree: ' + str(test_accuracy_dt))
print('Test accuracy for Random Forest: ' + str(test_accuracy_rf))

print('\n=========================================================================================')
print('Script complete')
      

print(f_data.shape)


'''
pca = PCA(n_components=100)
pca.fit(dgts_data)
tr_dt_p = pca.transform(dgts_data)
print pca.explained_variance_ratio_
print tr_dt_p.shape
print sum(pca.explained_variance_ratio_)
'''

mdl = knn(n_neighbors= 13)
mdl = rfc(n_estimators=500,min_samples_split=5,min_samples_leaf=3,criterion='entropy')
gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.15,random_state=10987)


for train_index, test_index in gen_k_sets:   
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    mdl.fit(train_data,train_class)
    print(mdl.score(test_data,test_class))
    #print mdl.feature_importances_


'''
mdl =  KMeans(n_clusters=10)
mdl.fit(tr_dt_p)
print mdl.labels_
Example #53
# Split data into training and cross-validation dataset
nptrd, npcvd = tts(trd,test_size=0.33)


# Train the model

pca = PCA(n_components=40)
pca.fit(nptrd[:,range(1,94)])
X = pca.transform(nptrd[:,range(1,94)])
PCAExplained = sum(pca.explained_variance_ratio_)

# Most of the features are highly skewed i.e. their 75% value ranges when we do td.describe() is 0 while their max is much higher. 
# This indicates that only a few values are non-zero for most features.
# This could mean that these features are actually categorical variables that are encoded in the test data.. could .. not sure

forest = rfc(n_estimators=500,criterion = 'entropy' , n_jobs=-1,min_samples_split=5,min_samples_leaf=5,max_depth=20)
#forest = forest.fit(nptrd[:,range(1,94)],nptrd[:,-1])
forest = forest.fit(X,nptrd[:,-1])

#temp = forest.predict(nptrd[:,range(1,94)])
temp = forest.predict(X)
TrainAccuracy = sum(temp == nptrd[:,-1]) / (len(nptrd)*1.0)

# Need to spend some time checking for overfit - using some elbow techiques maybe


# Cross validate the model using the cross validation dataset

XCv = pca.transform(npcvd[:,range(1,94)])
#output = forest.predict(npted[:,range(1,94)])
outputCv = forest.predict(XCv)
Example #54
n_estimators = [100, 200, 300, 400, 500]
best_cv_score = -9999.9999
best_n_est = 10000
avg_scores = []
for i in n_estimators:
    forest = rfc(n_estimators=i, oob_score=True)
    scores = cross_val_score(forest, trainData[0::, 1::], trainData[0::, 0], scoring='log_loss', cv=5, n_jobs=-1)
    avg_scores.append(auxiliary.calc_avg(scores))
    if avg_scores[-1] > best_cv_score:
        best_cv_score = avg_scores[-1]
        best_n_est = i
plt.plot(n_estimators, avg_scores)
plt.show()

forest_v = rfc(n_estimators=100, oob_score=True)
forest_v = AdaBoostClassifier()
forest = forest_v.fit(trainData[0::,1::], trainData[0::,0])

# Feature importances
importances = forest.feature_importances_
print "Feature Importances: ", importances

print('Predicting...')
output = forest.predict_proba(testData).astype(float)

output = output.tolist()
predictions_file = open("../submissionRF.csv", "w", newline="")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Id",'ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT',
                           'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION',
def main():
    
    if len(sys.argv) != 3:
        print("Usage: python filename [training_set] [queries]")
        return
    
    print("Hello World")
    
    headings = [
    "ID",
    "age",
    "job",
    "marital",
    "education",
    "default",
    "balance",
    "housing",
    "loan",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y"
    ]
    
    answerData = []
   
    """Read in data"""
    trainingdata = pandas.read_csv("Data/trainingset.txt",header = None, names = headings)
    queries = pandas.read_csv("Data/queries.txt",header = None, names = headings)



    idnum = [0]
    target = [17]
    cont = [1, 5, 6, 10, 12, 13, 14, 15]
    cat = [2, 3, 4, 7, 8, 9, 11, 16]



    """Learn from data"""
    
    ##Continuous Relevant Data: age, balance, previous
    ##Categorical Relevant Data: job, housing, loan, contact
    ##
    ##INSERT CODE THAT DOES THINGS HERE
    ##Implement Random Forest predictive algorithm
    ##Format data into numerical format
        
    relevantFeatures = ["age","balance","previous","job","housing","loan","contact"]
    model = rfc(n_estimators=1000)
    
    
    #made all data numeric
    length = len(trainingdata.index)
    print(length)
    
    trainingdata = numerify(trainingdata)
        
        
    """Answer Queries"""




    dataHeader = ["ID","Y"]
    answerData.append(dataHeader)


    #the following is test code to get id and target and put it into the answers
    
    for x in range (0, length):
        temp = []
        
        #trainingdata.set_value(x, 'ID', 'bork')
        
        newid = trainingdata.iloc[x]['ID']
        temp.append(newid)
        newtarget = trainingdata.iloc[x]['y']
        temp.append(newtarget)
        answerData.append(temp)
        
#id = trainingdata.iloc[x]['ID']
#newid = job_to_numeric(id)
#trainingdata.set_value(x, 'ID', newid)        

    """Output Queries"""
    #Write all the data from the array into a text file.
    #Each iteration of queries should be written into the answerData list, as lists themselves.
    newfile = open('./data/C12449618+C12474932.txt', 'w')
    writerObject = csv.writer(newfile, lineterminator='\n')
    
    for line in answerData:
        writerObject.writerow(line)
        
    newfile.flush()
    newfile.close()
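main() constructs the forest but the "Learn from data" step is left as a stub; a hypothetical completion, assuming numerify leaves the relevant columns numeric and that the queries frame goes through the same conversion:

# Hypothetical completion of the 'Learn from data' step inside main():
model.fit(trainingdata[relevantFeatures], trainingdata["y"])
queries_numeric = numerify(queries)
predictions = model.predict(queries_numeric[relevantFeatures])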
import sys

from sklearn.cross_validation import KFold as kfold
from sklearn.ensemble import AdaBoostClassifier as rfc

from sklearn.externals import joblib

import numpy as np


if __name__== "__main__":

	train_data = np.loadtxt(sys.argv[1],delimiter =',')
	train_label = np.loadtxt(sys.argv[2],delimiter=',')

	trees_list = [500]

	for i in trees_list:

		est = rfc(n_estimators=i)
		est.fit(train_data, train_label)
		filename = "ada_boost" + str(i) + ".pkl"
		joblib.dump(est, filename)
Example #57
testDf = auxiliary.initialise_test(False)
ids = testDf['Id'].values
# Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,Year,Week,Hour
testDf = testDf.drop(['Id', 'Dates', 'Address'], axis=1)

# Random Forest Algorithm
print(list(trainDf.columns.values))
print(list(testDf.columns.values))
#print list(trainDf.X.values)

# back to numpy format
trainData = trainDf.values
testData = testDf.values

print('Training...')
forest = rfc(n_estimators=25)
forest = forest.fit(trainData[0::,1::], trainData[0::,0])

print('Predicting...')
output = forest.predict_proba(testData).astype(float)
output = output.tolist()

predictions_file = open("../submissionRF.csv", "w", newline="")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Id",'ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT',
                           'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION',
                           'FAMILY OFFENSES','FORGERY/COUNTERFEITING','FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT',
                           'LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL','OTHER OFFENSES',
                           'PORNOGRAPHY/OBSCENE MAT','PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY',
                           'SECONDARY CODES','SEX OFFENSES FORCIBLE','SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY',
                           'SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS','VANDALISM','VEHICLE THEFT','WARRANTS',
Example #58
lbls = iris_data.target

print(dt.shape)

    
num_fold = 10
gen_k_sets = StratifiedKFold(lbls,num_fold,shuffle=True)
ab = []



overall_mis = 0
err=[]
c= 1.0
mdl = SVC(C=c,kernel='rbf',degree=1,tol=0.0001)
mdl = rfc(n_estimators=100,criterion='entropy',min_samples_leaf=5,min_samples_split=10,max_features=8)
mdl = knn(n_neighbors=1)
imgsize = 8
patchsize = 6
ab= []
for train_index, test_index in gen_k_sets:   

    train_data, test_data = dt[train_index], dt[test_index]
    train_class, test_class = lbls[train_index], lbls[test_index]
    dtsize= train_data.shape[0]
    train_data = train_data.reshape(dtsize,imgsize,imgsize)
    
    c1 = train_data[:,0:patchsize,0:patchsize] 
    '''
    a= c1[0,:,:]
    print a.shape
Example #59
for line in f:
	line = line.strip()
	labels.append(line)
f.close()


#train_data = np.reshape(train_data, (17000,100))
#test_data = np.reshape(test_data,(len(test_data),100))
overall_s = 0
for i in range(0, len(data), len(data)//10):
	#labels = np.array(labels)
	train_data = data[0:i] + data[i+len(data)//10:]
	test_data = data[i:i+len(data)//10]
	train_label = labels[0:i] + labels[i+len(data)//10:]
	test_label = labels[i:i+len(data)//10]
	test_label = np.array(test_label)
	train_label = np.array(train_label)
	train_data = np.array(train_data)
	test_data = np.array(test_data)
	clf = rfc(n_estimators=300)
	y_pred = clf.fit(train_data,train_label).predict(test_data)
	#pickle.dump(y_pred, open("out.p", "wb"))
	print(y_pred)
	print(test_label)
	count = 0
	for j in range(0, len(y_pred)):
		if y_pred[j] == test_label[j]:
			count += 1
	print((float(count)/len(y_pred))*100)
	overall_s += (float(count)/len(y_pred))*100
print(overall_s/10)