Example #1
def GradBoost(X_DS, Y_DS, X_train, X_test, y_train, y_test, Cl_Names = 'None', mask='None',Max_Depth=3):
#******************************************************************************

	import numpy as np
	from sklearn.ensemble import GradientBoostingClassifier as GBC #import library for machine learning analysis
	from sklearn.metrics import classification_report
	from sklearn.metrics import confusion_matrix as CM #confusion matrix helper (assumed to be sklearn's confusion_matrix)

	print('Gradient Boosting: Training...') #notify the user about the status of the process

	Gradient_Boosting_obj = GBC(max_depth=Max_Depth) #instantiate the built-in Gradient Boosting routine
	Gradient_Boosting_obj.fit(X_train, y_train) #fit the gradient boosting model to the training set
	Pred_Train = Gradient_Boosting_obj.predict(X_train) #apply the fitted model to the training set
	Pred_Test = Gradient_Boosting_obj.predict(X_test) #apply the fitted model to the test set

	print('Gradient Boosting: Completed!') #notify the user about the status of the process

	labels = len(np.unique(Y_DS)) #number of distinct classes in the target
	Conf_M = np.zeros((labels,labels), dtype='int') #initialize the confusion matrix for the classification problem
	
	if Cl_Names != 'None':
		target_names = Cl_Names
	else:
		target_names = np.arange(len(np.unique(Y_DS))).astype(str).tolist()
	#end

	Conf_M = CM(y_test, Pred_Test, labels=np.unique(Y_DS)) #build the confusion matrix from the test set and its predictions

	print(classification_report(y_test, Pred_Test, target_names=target_names))  #print the performance indicators on the console

	return Gradient_Boosting_obj, Conf_M
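
A minimal usage sketch for the helper above, with scikit-learn's iris data standing in for the caller's dataset (everything below is illustrative, not part of the original example):

# Hypothetical driver: iris stands in for the real dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, test_size=0.3, random_state=0)
model, conf_m = GradBoost(iris.data, iris.target, X_tr, X_te, y_tr, y_te,
                          Cl_Names=list(iris.target_names), Max_Depth=3)
print(conf_m)  # confusion matrix on the held-out split
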
def test_mem_layout():
    # Test with different memory layouts of X and y
    X_ = np.asfortranarray(X)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X_, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    X_ = np.ascontiguousarray(X)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X_, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    y_ = np.asarray(y, dtype=np.int32)
    y_ = np.ascontiguousarray(y_)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    y_ = np.asarray(y, dtype=np.int32)
    y_ = np.asfortranarray(y_)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))
class GBClassifier:

    def __init__(self):
        """
        Initializes the gradient boosting classifier
        """
        self.header = "#gbc"
        self.clf = None
        self.learningRate = 0.1
        self.n_estimators = 100
        self.loss = "deviance"
        self.acceptedLossValues = ["deviance", "exponential"]

    def setNumberOfEstimators(self, n_estimators):
        """
        Sets the number of estimators of Gradient Boosting Classifier
        """
        self.n_estimators = n_estimators

    def setLoss(self, loss):
        """
        Sets the loss parameter for the Gradient Boosting Classifier
        """
        try:
            if loss in self.acceptedLossValues:
                self.loss = loss
            else:
                raise ValueError("Error in input value")
        except Exception as error:
            logging.warning("Error: No such loss value:%s", loss)

    def buildModel(self):
        """
        This builds the model of the Gradient boosting Classifier
        """
        logging.info("Building Model")
        self.clf = GradientBoostingClassifier(loss=self.loss, n_estimators=self.n_estimators,
                     learning_rate = self.learningRate)
        logging.info("Finished Building Model")

    def trainGBC(self,X, Y):
        """
        Training the Gradient Boosting Classifier
        """
        self.clf.fit(X, Y)

    def validateGBC(self,X, Y):
        """
        Validate the Gradient Boosting Classifier
        """
        YPred = self.clf.predict(X)
        print(accuracy_score(Y, YPred))

    def testGBC(self,X, Y):
        """
        Test the Gradient Boosting Classifier
        """
        YPred = self.clf.predict(X)
        print(accuracy_score(Y, YPred))
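
A short, hypothetical driver for the wrapper above; it assumes the module-level imports the class relies on (logging, GradientBoostingClassifier, accuracy_score) and pre-split arrays:

# Hypothetical usage; X_train/y_train/X_test/y_test are assumed to exist
gbc = GBClassifier()
gbc.setNumberOfEstimators(200)
gbc.setLoss("deviance")
gbc.buildModel()
gbc.trainGBC(X_train, y_train)
gbc.validateGBC(X_test, y_test)  # prints accuracy on the held-out data
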
Example #4
def gbc(train,test,train_target,test_target, lr=.1, n_est=100):
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=lr, n_estimators=n_est)
    clf.fit(train, train_target)
    res = clf.predict(train)
    
    print('*************************** GBC ****************')
    print(classification_report(train_target, res))

    res1 = clf.predict(test)
    print(classification_report(test_target, res1))
    return clf
def test_degenerate_targets():
    """Check if we can fit even though all targets are equal. """
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    # classifier should raise exception
    assert_raises(ValueError, clf.fit, X, np.ones(len(X)))

    clf = GradientBoostingRegressor(n_estimators=100, random_state=1)
    clf.fit(X, np.ones(len(X)))
    clf.predict(rng.rand(2))
    assert_array_equal(np.ones((1,), dtype=np.float64), clf.predict(rng.rand(2)))
Example #6
def model_color_gboost(tfidf_train, tfidf_test, y_train, y_test):
    # Train the model
    clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80, subsample=0.80, max_depth=4)
    clf.fit(tfidf_train, y_train)

    # Check the validity
    pred = clf.predict(tfidf_train.toarray())
    print("Accuracy on train set: ", 100*accuracy_score(pred, y_train))
    pred = clf.predict(tfidf_test.toarray())
    print("Accuracy on validation: ", 100*accuracy_score(pred, y_test))
    print(confusion_matrix(y_test, pred,
                           labels=['press-6', 'press-5', 'press-4', 'press-3', 'press-2', 'press-1']))
Example #7
def predict_author(arr, yazar_features, yazar_classes):
    results = []

    print("\n[DEBUG] K-NN result (neighbors: 10)")
    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(yazar_features, yazar_classes)
    print(knn.predict(arr))
    results.append(knn.predict(arr)[0])

    print("\n[DEBUG] SVC result (linear) (degree=3)")
    svc = svm.SVC(kernel='linear', degree=3)
    svc.fit(yazar_features, yazar_classes)
    print(svc.predict(arr))
    results.append(svc.predict(arr)[0])

    print("\n[DEBUG] Logistic Regression result ()")
    regr = linear_model.LogisticRegression()
    regr.fit(yazar_features, yazar_classes)
    print(regr.predict(arr))
    results.append(regr.predict(arr)[0])

    print("\n[DEBUG] Gaussian Naive Bayes")
    gnb = GaussianNB()
    gnb.fit(yazar_features, yazar_classes)
    print(gnb.predict(arr))
    results.append(gnb.predict(arr)[0])

    print("\n[DEBUG] Decision Tree Classifier")
    dtc = tree.DecisionTreeClassifier()
    dtc.fit(yazar_features, yazar_classes)
    print(dtc.predict(arr))
    results.append(dtc.predict(arr)[0])

    print("\n[DEBUG] Gradient Boosting Classification")
    gbc = GradientBoostingClassifier()
    gbc.fit(yazar_features, yazar_classes)
    print(gbc.predict(arr))
    results.append(gbc.predict(arr)[0])
    # output = open('features.pkl', 'wb')
    # pickle.dump(yazar_features, output)
    # output.close()

    # output = open('classes.pkl', 'wb')
    # pickle.dump(yazar_classes, output)
    # output.close()

    # test_yazar_features = []        # for test data
    # test_yazar_classes = []         # for test classes
    # # yazar_features = []             # for train data
    # # yazar_classes = []              # for train classes

    return results
Example #8
def gradient_boost(x_train, x_test, y_train, 
                                 y_test, rands = None):
    """
    Predict the lemons using a GradientBoostingClassifier and a random seed
    both for the number of features, as well as for the size of the
    sample to train the data on

    ARGS:

        - x_train: :class:`pandas.DataFrame` of the x_training data

        - y_train: :class:`pandas.Series` of the y_training data

        - x_test: :class:`pandas.DataFrame` of the x_testing data

        - y_test: :class:`pandas.Series` of the y_testing data

        - rands: a :class:`tuple` of the (rs, rf) to seed the sample
        and features of the GradientBoostingClassifier.  If `None`, then
        rands are generated and provided in the return `Series`

    RETURNS:

        :class:`pandas.Series` of the f1-scores and random seeds
    """
    #create a dictionary for the return values
    ret_d = {'train-f1':[], 'test-f1':[], 'rs':[], 'rf':[]}

    #use the randoms provided if there are any, otherwise generate them
    if not rands:
        rs =  numpy.random.rand()
        rf = numpy.random.rand()
        while rf < 0.1:
            rf = numpy.random.rand()
    else:
        rs, rf = rands[0], rands[1]
    #place them into the dictionary
    ret_d['rs'], ret_d['rf'] = rs, rf
    #create and run the gradient boosting classifier
    bc = GradientBoostingClassifier(n_estimators = 300,
                                    max_features = rf)
    bc.fit(x_train, y_train)

    y_hat_train = bc.predict(x_train)
    ret_d['train-f1'] = f1_score(y_train, y_hat_train)
    y_hat_test = bc.predict(x_test)
    ret_d['test-f1'] = f1_score(y_test, y_hat_test)
    return pandas.Series(ret_d)
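
Given the ARGS/RETURNS contract in the docstring, a call might look like the sketch below; the lemons DataFrames are assumed to exist and the names are placeholders:

# Hypothetical call; x_train/x_test are DataFrames, y_train/y_test are Series
scores = gradient_boost(x_train, x_test, y_train, y_test)
print(scores['train-f1'], scores['test-f1'])  # f1 on the train and test splits
print(scores['rs'], scores['rf'])             # the random seeds that were used
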
def GB_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    print("***************Starting Gradient Boosting***************")
    t0 = time()
    clf = GradientBoostingClassifier(n_estimators=500,learning_rate=0.01)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("Gradient Boosting - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100
    print(Summary)

    #Check with log loss function
    epsilon = 1e-15
    #ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)

    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending Gradient Boosting***************")
    return pd.DataFrame(preds2),pd.DataFrame(preds3),pd.DataFrame(preds4)
def train_gbt(filename, color, name):
	'''Train on Gradient Boosted Trees Classifier'''
	# Read data
	data2 = pd.read_csv(filename, encoding="utf")
	X = data2.iloc[:, 1:-1]
	y = data2.iloc[:, -1]

	# Split into train, validation and test
	X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

	# Define model
	clf1 = GradientBoostingClassifier(learning_rate=0.05, max_depth=5, random_state=42)
	
	# Fit model
	t0 = time()
	clf1.fit(X_train, y_train)
	pred_probas = clf1.predict_proba(X_val)

	predictions = clf1.predict(X_val)
	
	print("Score", clf1.score(X_val, y_val))

	importances = clf1.feature_importances_
	indices = np.argsort(importances)[::-1]
	
	# Metrics & Plotting
	metrics[1, 0] = precision_score(y_val, predictions)
	metrics[1, 1] = recall_score(y_val, predictions)
	metrics[1, 2] = f1_score(y_val, predictions)
	metrics[1, 3] = time() - t0

	fpr_rf, tpr_rf, _ = roc_curve(y_val, predictions)
	plt.plot(fpr_rf, tpr_rf, color=color, label=name)

	return importances, indices
Example #11
def gbPredict(LOSS, N_EST, L_RATE, M_DEPT, SUB_S, W_START, N_FOLD, EX_F, TRAIN_DATA_X, TRAIN_DATA_Y, TEST__DATA_X, isProb):
    # feature extraction
    ### clf  = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    ### extA = delFeatMin(clf.feature_importances_, EX_F)
    ### TRAIN_DATA_X = TRAIN_DATA_X[:, extA]
    # k-fold validation
    kf   = KFold(TRAIN_DATA_Y.shape[0], n_folds=N_FOLD)
    tesV = 0.0
    for train_index, test_index in kf:
        X_train, X_test = TRAIN_DATA_X[train_index], TRAIN_DATA_X[test_index]
        y_train, y_test = TRAIN_DATA_Y[train_index], TRAIN_DATA_Y[test_index]
        clf  =  GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(X_train, y_train)
        tesK =  1 - clf.score(X_test, y_test)
        tesV += tesK
    eVal = tesV / N_FOLD
    # train all data
    clf  = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    ### TEST__DATA_X = TEST__DATA_X[:, extA]  # extA comes from the feature-extraction block above, which is disabled
    if isProb:
        data = clf.predict_proba(TEST__DATA_X)
    else:
        data = clf.predict(TEST__DATA_X)

    print("Eval =", eVal, "with n_esti =", N_EST, "l_rate =", L_RATE, "m_dep =", M_DEPT, "sub_s =", SUB_S, "ex_num =", EX_F, "and loss is", LOSS)

    return (data, eVal)
Example #12
def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200,
                                     random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)
    # test raise NotFittedError if not fitted
    assert_raises(NotFittedError, lambda X: np.fromiter(
        clf.staged_predict_proba(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
Example #13
def main():
    print("gradient boosting  classifier!")

    X,Y,Xtest = importdata()
    print(Y.shape)
    param_grid={
            "n_estimators":[10,100,200,2000,20000],
            "min_samples_split":[5,10,20,50]
            }

    gb=GradientBoostingClassifier()
    Gridsearch_impl(X,Y,gb,param_grid,5)

#    for i in range(10,11,5):
#        clf = DecisionTreeClassifier(min_samples_split=i)
#        rf = RandomForestClassifier(n_estimators = 100,random_state=0,min_samples_split=i)
#        ab = AdaBoostClassifier(rf,n_estimators = 10)
        #ab = GradientBoostingClassifier(n_estimators = 100)
#        score = cross_validation.cross_val_score(ab,X,Y,cv=3)
      #  print(score)
      #  print("average score %f"%np.mean(score))
      #  print("std %f"%np.std(score))
      #  ab.fit(X,Y)
   


    Ytest = gb.predict(Xtest)
    output(Ytest,'submit3.csv')
def main():
  """
  Use random forests to classify, based on cv results
  """
  from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
  from sklearn.grid_search import GridSearchCV
  from sklearn.preprocessing import StandardScaler
  import sys 

  #call import_data on STDIN, returning formatted query results
  M, N, q, tset, pset = import_data(sys.stdin)
  #create features list so we can easily grab the feature fields
  features = ["F" + str(j) for j in range(1, M+1)]
 
  #read in to a pandas dataframe and perform some preprocessing
  training_set = pd.DataFrame(tset).set_index('ID')
  pred_set = pd.DataFrame(pset).set_index('ID')
  
  scale = StandardScaler().fit(training_set[features])
  training_set[features] = scale.transform(training_set[features])
  pred_set[features] = scale.transform(pred_set[features])
  
  #adjust the labeling convention
  training_set['Label'] = training_set['Label'] == "+1"

  grad = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=1)
  grad.fit(training_set[features], training_set['Label'])

  def print_results(x):
    if x['Pred_Label'] == 1: print(x.name + " +1")
    else: print(x.name + " -1")
    
  pred_set['Pred_Label'] = grad.predict(pred_set[features])
  pred_set.apply(print_results, axis=1)
def fit_model():
    DATA_FILE  = './data/train-set-ru-b64-utf-8.txt'
    stats_collector = StatsCollector()
    i=0
    data = []
    target = []

    with open (DATA_FILE) as df:
         for i, line in enumerate(df):
            print(i)
            line = line.strip()
            parts = line.split()
            stats_collector = StatsCollector()
            stats_collector.collect(int(parts[1]), parts[3], parts[2])
            data.append(stats_collector.get_features())
            target.append(stats_collector.get_target())
            #print len(data[-1])


    data = np.asarray(data, dtype = np.float)
    target = np.asarray(target, dtype = np.float)
    print(data.shape, target.shape)
    df.close()
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.07, n_estimators=300, min_samples_split=30,\
         min_samples_leaf=15, max_depth=4)

    clf.fit(data, target)
    y_pred = clf.predict(data)
    print(f1_score(target, y_pred))

    joblib.dump(clf, 'model/model.pkl') 
def classify_survivors(Y = labels, orig_test = test_data):
	X, test = featurizer()

	best_model = {'n_estimators': 20, 'learning_rate': 1.0, 'max_depth': 3}	

	gbt = GradientBoostingClassifier(subsample=0.8, min_samples_leaf=50, min_samples_split=20,
		n_estimators = 20, learning_rate = 1.0, max_depth = 3)

	ID_col = orig_test.loc[:,['PassengerId']]
	print(ID_col.iloc[0:10])
	gbt.fit(X,Y)
	#print test.ix[0:10]
	predicted_results = gbt.predict(test)
	predicted_results = pd.DataFrame(predicted_results)
	predicted = pd.concat( [ID_col,predicted_results], axis=1 )
	predicted = predicted.rename(columns={0 : 'Survived'})
	#predicted = predicted.drop(' ',axis=1)
	del predicted['']

	#Print some of the dataframe with predictions to test results
	print(predicted.iloc[0:15], '\n')
	#print X.ix[0:15]

	#Output result dataframe as csv
	predicted.to_csv('predicted_results.csv')
def main():
    print('[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S')))
    testing_file = open('test.p', 'rb')
    training_file = open('train.p', 'rb')

    train = pickle.load(training_file)
    test = pickle.load(testing_file)

    testing_file.close()
    training_file.close()
    
    trainX = train[:,:-1]
    trainy = train[:,-1]
    
    testX = test[:,:-1]
    testy = test[:,-1]

    print('[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'GradientBoostingClassifier(n_estimators=1000)'))
    clf = GradientBoostingClassifier(n_estimators=1000)
    clf.fit(trainX, trainy)

    print('[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S')))
    prediction = clf.predict(testX)
    print('[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction)))


    model_save_file = open('gradient_1000.p', 'wb')
    pickle.dump(clf, model_save_file)
    model_save_file.close()
    print('All done')
def cv_model():
    DATA_FILE  = './data/train-set-ru-b64-utf-8.txt'
    all_data = []
    target = []
    with open(DATA_FILE) as df:
        for i, line in enumerate(df):
            print(i)
            line = line.strip()
            parts = line.split()
            stats_collector = StatsCollector()
            #print parts[2]
            #print base64.b64decode(parts[3])#.decode('utf-8')
            #print parts[2].decode('utf-8'), parts[3].decode('utf-8'), "\n"
            stats_collector.collect(int(parts[1]), parts[3], parts[2])
            # mark page url
            all_data.append(stats_collector.get_features())
            target.append(stats_collector.get_target())
            #print all_data[-1]

    data = np.asarray(all_data, dtype = np.float)
    target = np.asarray(target, dtype = np.float)

    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.05, n_estimators=400,\
     min_samples_split=30, min_samples_leaf=15, max_depth=5)

    kf = KFold(data.shape[0], n_folds = 3, shuffle = True)

    for train_index, test_index in kf:
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(f1_score(y_test, y_pred))
Example #19
def mse_sklearn(x_train, x_test, y_train, y_test, n_estimators):
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     min_samples_leaf=MIN_SAMPLES_LEAF,
                                     max_depth=MAX_DEPTH)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    return f1_score(y_test, pred)
Example #20
class Blender(BaseEstimator, ClassifierMixin):
    def __init__(self, trained_clfs):
        self.clfs = trained_clfs
        # self.classifier = make_pipeline(OneHotEncoder(), DenseTransformer(),
        #                                 GradientBoostingClassifier())
        self.classifier = GradientBoostingClassifier()
        # self.classifier = make_pipeline(
        #     OneHotEncoder(), LogisticRegression(class_weight='auto'))

    def fit(self, data, target):
        # self.enc = LabelEncoder().fit(target)
        probs = self.transform_input(data)
        # self.classifier.fit(predictions, target)
        self.classifier.fit(probs, target)

    def predict(self, data):
        predictions = self.transform_input(data)
        return self.classifier.predict(predictions)

    def transform_input(self, data):
        probabilities = [clf.predict_proba(data) for clf in self.clfs]

        probabilities = np.array(probabilities)
        # features, samples = probabilities.shape
        n_clfs, samples, features = probabilities.shape
        probabilities = np.reshape(probabilities, (samples, n_clfs * features))
        probabilities[np.isnan(probabilities)] = 0
        return probabilities
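
A hedged sketch of how the blender above might be driven: the base classifiers are trained first and passed in already fitted; the data names are placeholders:

# Hypothetical usage: blend two pre-trained probabilistic classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

base1 = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
base2 = LogisticRegression().fit(X_train, y_train)

blender = Blender([base1, base2])
blender.fit(X_valid, y_valid)   # second-level model fit on the stacked predict_proba outputs
blend_pred = blender.predict(X_test)
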
 def get_n_fold_validation_score(self, fold=10):
     features = data.get_features()
     lables = data.get_lables()
     length = len(features)
     jump = length // fold  # integer fold size so it can be used for slicing
     index = 0
     k = 0
     scores = list()
     while k < fold:
         feature_test = features.iloc[index : (index + jump), :]
         lable_test = lables.iloc[index : (index + jump), :]
         feature_train_1, feature_train_2 = (
             features.iloc[0 : index - 1, :] if index != 0 else pd.DataFrame(),
             features.iloc[index + jump + 1 : length - 1],
         )
         feature_train = pd.concat([feature_train_1, feature_train_2])
         lable_train_1, lable_train_2 = (
             lables.iloc[0 : index - 1, :] if index != 0 else pd.DataFrame(),
             lables.iloc[index + jump + 1 : length - 1],
         )
         lable_train = pd.concat([lable_train_1, lable_train_2])
         index += jump
         k += 1
         classifier = GradientBoostingClassifier()
         classifier.fit(feature_train, lable_train["lable"].values)
         scores.append(accuracy_score(lable_test, classifier.predict(feature_test)))
     return sum(scores) / float(len(scores))
Example #22
def final_run(X,Y,Xtest,n_est):
    clf = GradientBoostingClassifier(n_estimators=n_est,random_state=n_est)
    clf = clf.fit(X,Y)
    #np.savetxt('gb_oob_improve_{}'.format(n_est),clf.oob_score_)
    #np.savetxt('gb_train_score_{}'.format(n_est),clf.train_score_)
    Ytest=clf.predict(Xtest)
    output(Ytest,'gradient_boost_{}.csv'.format(n_est))
Example #23
def classify2(dis_data, numeric_data, t_label):
    fold = 5
    skf = StratifiedKFold(t_label, fold)
    roc_auc = 0  
    f1_score_value = 0

    clf1 = LogisticRegression()
    clf2 = GradientBoostingClassifier()
#    clf3 = tree.DecisionTreeClassifier(max_depth=500, max_leaf_nodes= 500, class_weight={1:12})
    clf3 = GradientBoostingClassifier()
    
    for train, test in skf:
        clf3 = clf3.fit(dis_data.iloc[train], t_label.iloc[train])
        
        #compute auc
        probas_  = clf3.predict_proba(dis_data.iloc[test])
        fpr, tpr, thresholds = roc_curve(t_label.iloc[test], probas_[:, 0])
        roc_auc += auc(fpr, tpr)    
        
        #compute f1_score
        label_pred = clf3.predict(dis_data.iloc[test])
        
        f1_score_value += f1_score(t_label.iloc[test], label_pred, pos_label= 1)
        
    return roc_auc / fold, f1_score_value / fold     
Example #24
class MyGradientBoosting(MyClassifier):
    def __init__(self):
        self.gradient_boosting = None

    def train(self, data_path='data/train.pkl', n_estimators=10, learning_rate=0.1):
        labels, instances = load_pickled_dataset(data_path)
        start_time = time.time()
        self.gradient_boosting = GradientBoostingClassifier(loss='deviance', learning_rate=learning_rate,
                                                            n_estimators=n_estimators, subsample=0.3,
                                                            min_samples_split=2,
                                                            min_samples_leaf=1,
                                                            max_depth=3,
                                                            init=None,
                                                            random_state=None,
                                                            max_features=None,
                                                            verbose=2)
        self.gradient_boosting.fit(instances, labels)
        end_time = time.time()
        print("STATUS: model training done. elapsed time - %d seconds" % (end_time - start_time))
        print("INFO: " + str(self.gradient_boosting))

    def predict(self, data_path='data/test.pkl'):
        labels, instances = load_pickled_dataset(data_path)
        return self.gradient_boosting.predict(instances)

    def save(self, file_path='model/gbc_model'):
        joblib.dump(self.gradient_boosting, file_path)

    def load(self, file_path='model/gbc_model'):
        self.gradient_boosting = joblib.load(file_path)

    def write_results(self, predictions):
        super(MyGradientBoosting, self).write(predictions, 'gbc_prediction.csv')
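
A possible end-to-end use of the class above, assuming the pickled data files and the MyClassifier base class from the original project are available:

# Hypothetical workflow: train, persist, reload and predict
model = MyGradientBoosting()
model.train('data/train.pkl', n_estimators=200, learning_rate=0.05)
model.save('model/gbc_model')

model.load('model/gbc_model')
predictions = model.predict('data/test.pkl')
model.write_results(predictions)  # writes gbc_prediction.csv via the base class
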
Example #25
def plotLearningCurve(dat,lab,optim):

    '''
    This function plots the learning curve for the classifier

    Parameters:
    -----------
    dat: numpy array with all records
    lab: numpay array with class labels of all records
    optim: optimal parameters for classifier

    '''

    clf = GradientBoostingClassifier(learning_rate = optim[0], subsample = optim[1])

    # split training data into train and test (already chose optimal parameters)
    xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(dat, lab, 
                                                                     test_size = 0.3)

    # choose various sizes of training set to model on to generate learning curve
    szV = list(range(10, np.shape(xTrain)[0], int(np.shape(xTrain)[0] / 10)))
    szV.append(np.shape(xTrain)[0])

    LCvals = np.zeros((len(szV),3), dtype = np.float64) # store data points of learning curve
    for i in range(0, len(szV)):
        clf = clf.fit(xTrain[:szV[i],:], yTrain[:szV[i]])
        LCvals[i,0] = szV[i]
        LCvals[i,1] = clf.score(xTest, yTest)
        LCvals[i,2] = clf.score(xTrain[:szV[i],:], yTrain[:szV[i]])

    #print LCvals

    # generate figure
    fig = plt.figure(1, figsize = (10,10))
    prop = matplotlib.font_manager.FontProperties(size=15.5)
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(LCvals[:,0] / np.float64(np.shape(xTrain)[0]), 1.0 - LCvals[:,1], 
            label = 'Test Set')
    ax.plot(LCvals[:,0] / np.float64(np.shape(xTrain)[0]), 1.0 - LCvals[:,2],
            label = 'Training Set')
    ax.set_ylabel(r"Error", fontsize = 20)
    ax.set_xlabel(r"% of Training Set Used", fontsize = 20)
    ax.axis([0.0, 1.0, -0.1, 0.5])
    plt.legend(loc = 'upper right', prop = prop)
    plt.savefig('LC_GB.pdf', bbox_inches = 'tight')
    fig.clear()

    # where is model failing?
    
    predProb = clf.predict_proba(xTest)
    tmp = np.zeros((np.shape(predProb)[0], np.shape(predProb)[1] + 2))
    tmp[:,:-2] = predProb
    tmp[:,-2] = clf.predict(xTest)
    tmp[:,-1] = yTest
    mask = tmp[:,-2] != tmp[:,-1]
    print(tmp[mask])
    print(mask.sum(), len(xTest))
    
    print(tmp[:50,:])
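
A hypothetical call for the plotting helper above, assuming the feature matrix, labels and a previously tuned (learning_rate, subsample) pair:

# Hypothetical invocation; dat/lab are numpy arrays, optim = (learning_rate, subsample)
plotLearningCurve(dat, lab, optim=(0.1, 0.8))  # writes LC_GB.pdf
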
Example #26
    def rand_forest_train(self):
        # Read the local user feature information
        users = pd.read_csv('names.csv')
        # Use similarity, platform, reputation and entropy as the features for telling humans and machines apart
        X = users[['similarity', 'platform', 'reputation', 'entropy']]
        y = users['human_or_machine']

        # Split the raw data, holding out 25% for testing
        from sklearn.cross_validation import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

        # Convert the categorical features into feature vectors
        from sklearn.feature_extraction import DictVectorizer
        vec = DictVectorizer(sparse=False)
        X_train = vec.fit_transform(X_train.to_dict(orient='records'))
        X_test = vec.transform(X_test.to_dict(orient='records'))

        # Train and run predictions with a single decision tree
        from sklearn.tree import DecisionTreeClassifier
        dtc = DecisionTreeClassifier()
        dtc.fit(X_train, y_train)
        dtc_y_pred = dtc.predict(X_test)

        # Train and run predictions with a random forest classifier
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        rfc.fit(X_train, y_train)
        rfc_y_pred = rfc.predict(X_test)

        # Train and run predictions with gradient boosted decision trees
        from sklearn.ensemble import GradientBoostingClassifier
        gbc = GradientBoostingClassifier()
        gbc.fit(X_train, y_train)
        gbc_y_pred = gbc.predict(X_test)

        from sklearn.metrics import classification_report
        # Report the single decision tree's accuracy on the test set, plus precision, recall and F1
        print("Decision tree accuracy:", dtc.score(X_test, y_test))
        print(classification_report(dtc_y_pred, y_test))

        # Report the random forest classifier's accuracy on the test set, plus precision, recall and F1
        print("Random forest accuracy:", rfc.score(X_test, y_test))
        print(classification_report(rfc_y_pred, y_test))

        # Report the gradient boosted trees' accuracy on the test set, plus precision, recall and F1
        print("Gradient boosted trees accuracy:", gbc.score(X_test, y_test))
        print(classification_report(gbc_y_pred, y_test))


        users = pd.read_csv('values.csv')

        # Check whether each record is from a machine or a human
        X = users[['similarity', 'platform', 'reputation', 'entropy']]
        X = vec.transform(X.to_dict(orient='records'))
        print(rfc.predict(X))

        self.dtc = dtc
        self.rfc = rfc
        self.gbc = gbc
Example #27
class Gdbc1Model:
    def __init__(self):
        self.model = GradientBoostingClassifier(max_features=0.6, learning_rate=0.05, max_depth=5, n_estimators=300)

    def fit(self,x,y):
        self.model.fit(x,y)
    def predict(self,X):
        return self.model.predict(X)
Example #28
def GradientBoosting():
    #import libraries
    from sklearn.ensemble import GradientBoostingClassifier #for classification
    from sklearn.ensemble import GradientBoostingRegressor #for regression
    #use GBM function
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
Example #29
def gradient_boosting_classify(my_train_data, my_train_label, my_test_data, estimators):
    clf = GradientBoostingClassifier(n_estimators=estimators)
    scores = cross_validation.cross_val_score(clf, my_train_data, my_train_label, cv=5)
    print("gradient boosting(%d) accuracy: %0.3f (+/- %0.3f)" % (estimators, scores.mean(), scores.std() * 2))
    clf.fit(my_train_data, my_train_label)
    my_test_label = clf.predict(my_test_data)
    file_name = "gradient_boosting_%d.csv" % estimators
    data_storer.save_data(my_test_label, file_name)
Example #30
def gradient_boosting_classifier(x_train, y_train, x_test, y_test, num_tree):
    model = Gbc(loss='deviance', learning_rate=0.2, n_estimators=num_tree, subsample=1.0, min_samples_split=2,
                min_samples_leaf=10, min_weight_fraction_leaf=0.0, max_depth=5, init=None, random_state=None,
                max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False)
    model.fit(x_train, y_train)
    expected = y_test
    predicted = model.predict(x_test)
    return expected, predicted
    print(msg)

# GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier selected

######## Run a basic GradientBoostingClassifier ########

GBC = GradientBoostingClassifier(random_state=10)

# Scaling
steps = [('scaler', scaler), ('GBC', GBC)]
#Pipelining
pipeline = Pipeline(steps)

GBC.fit(X_train, y_train)

y_pred = GBC.predict(X_test)
y_predict_prob = GBC.predict_proba(X_test)[:, 1]

cm = confusion_matrix(y_test, y_pred)
print(cm)

# Print accuracy of the model
print('Score: {}'.format(accuracy_score(y_pred, y_test)))

# Generate ROC curve values: fpr, tpr, thresholds
fpr_gbc, tpr_gbc, thresholds_gbc = roc_curve(y_test, y_predict_prob)
auc_gbc = roc_auc_score(y_test, y_pred)
print("AUC: ", auc_gbc)

#Score: 0.7875964036619049
#AUC:  0.7886511356295131
Example #32
x_train, x_test, y_train, y_test = train_test_split(df_cp,
                                                    train_Y,
                                                    test_size=0.25,
                                                    random_state=4)

########## model start
from sklearn.ensemble import GradientBoostingClassifier

gdbt = GradientBoostingClassifier(learning_rate=0.01)

# Train the model
gdbt.fit(x_train, y_train)

# Predict on the test set
y_pred = gdbt.predict(x_test)

y_pred_proba = gdbt.predict_proba(x_test)[:, 1]

########## model end

########## model evaluation start
from sklearn import datasets, metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

check_view = pd.DataFrame({'pred_poi': y_pred_proba, 'poi': y_test})
check_view = check_view.sort_values(by=['pred_poi'])

acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
Example #33
def main():

    clf = GradientBoostingClassifier(n_estimators=1000,
                                     min_samples_split=15,
                                     learning_rate=0.1,
                                     max_depth=160)
    feature_set_id = '59'

    feature_sets_file = args.feature_sets_log_file

    feature_set_dict = {}
    with open(feature_sets_file, 'r') as stream:
        feature_set_dict = yaml.load(stream)

    feature_set = feature_set_dict[feature_set_id]

    engine = model_pipeline_script.get_engine()
    con = engine.connect()

    if args.prediction_table != '':
        contract_flag = True
    else:
        contract_flag = False

    contracts_data = pd.read_sql(args.training_table, engine)
    if contract_flag:
        prediction_data = pd.read_sql(args.prediction_table, engine)

    print(contracts_data.columns)
    #proccess training data
    contracts_data['amt_standardized'] = contracts_data['amount_standardized']
    contracts_data['contract_signing_date'] = pd.to_datetime(
        contracts_data['contract_signing_date'])
    #Subsetting on only main allegation outcomes
    train_data = contracts_data[
        (contracts_data['allegation_outcome'] == 'Substantiated') |
        (contracts_data['allegation_outcome'] == 'Unfounded') |
        (contracts_data['allegation_outcome'] == 'Unsubstantiated')]

    train_data, col_group_dict_train = model_pipeline_script.join_features(
        engine, con, contracts_data, args.train_table_id)
    col_group_dict_train, col_group_keys_train = model_pipeline_script.define_feature_sets(
        col_group_dict_train)

    if contract_flag:
        #process prediction data
        prediction_data['amt_standardized'] = prediction_data[
            'amount_standardized']
        prediction_data['contract_signing_date'] = pd.to_datetime(
            prediction_data['contract_signing_date'])
        prediction_data['allegation_category'] = args.allegation_category

        prediction_data, col_group_dict_predict = model_pipeline_script.join_features(
            engine, con, prediction_data, args.predict_table_id)
        col_group_dict_predict, col_group_keys_predict = model_pipeline_script.define_feature_sets(
            col_group_dict_predict)

    train_df = train_data[train_data['allegation_outcome'].notnull()]
    if not contract_flag:
        predict_df = train_data[train_data['allegation_outcome'].isnull()]

        predict_df.drop('allegation_outcome', 1, inplace=True)
    else:
        predict_df = prediction_data

    feature_set_new = []
    for feat_set in feature_set:
        if 'cntrcts_splr_ftr_set_train' in feat_set:
            feat_set = feat_set.replace(
                'cntrcts_splr_ftr_set_train',
                'cntrcts_splr_ftr_set_' + args.train_table_id)
        feature_set_new.append(feat_set)
    feature_set = feature_set_new

    df_features_train, y_train = model_pipeline_script.select_features(
        train_df, col_group_dict_train, feature_set)

    print('feat_sets:')
    if args.predict_table_id != '':
        feature_set_new = []
        for feat_set in feature_set:
            print(feat_set)
            if 'cntrcts_splr_ftr_set_' + args.train_table_id in feat_set:
                feat_set = feat_set.replace(
                    'cntrcts_splr_ftr_set_' + args.train_table_id,
                    'cntrcts_splr_ftr_set_' + args.predict_table_id)
            feature_set_new.append(feat_set)
        feature_set = feature_set_new

    print('shape: ')
    print(predict_df.shape, feature_set)
    if contract_flag:
        df_features_predict, y_predict = model_pipeline_script.select_features(
            predict_df, col_group_dict_predict, feature_set)
    else:
        df_features_predict, y_predict = model_pipeline_script.select_features(
            predict_df, col_group_dict_train, feature_set)
    print(df_features_predict.shape)

    df_to_write = df_features_train.merge(pd.DataFrame(y_train),
                                          left_index=True,
                                          right_index=True)
    df_to_write.to_csv('features_and_outcomes.csv')

    matching_cols = [
        val for val in df_features_train.columns
        if val in set(df_features_predict.columns)
    ]
    print(len(matching_cols), len(df_features_train.columns), len(
        df_features_predict.columns))

    df_features_train = df_features_train[matching_cols]
    df_features_predict = df_features_predict[matching_cols]

    x_train = np.array(df_features_train)
    y_train = np.array(y_train)
    x_train = x_train.astype(float)

    x_predict = np.array(df_features_predict)
    x_predict = x_predict.astype(float)

    print('Fitting....')
    clf.fit(x_train, y_train)

    print('Predicting...')
    y_pred = clf.predict(x_predict)
    y_proba = clf.predict_proba(x_predict).T[1]

    #code for printing out top features
    #try:

    #    print 'Feature importance...'
    #        print df_features_train.columns,df_features_train.shape
    #   top_features = model_pipeline_script.get_feature_importance(clf,x_train,y_train,df_features_train.columns,nfeatures=50)
    #  print top_features
    #feat_idx = []
    #for feat in top_features:
    #    print feat
    #    idx =

    #        model_pipeline_script.decision_surface_plot(clf,df_features_train,y_train,top_features)

    # except IOError:
    #    ''

    #code for plotting distribution of prediction scores
    # plt.hist(y_proba,bins=30)
    # if contract_flag:
    #     plt.title('Prediction Scores on Contracts')
    # else:
    #     plt.title('Prediction Scores on Uninvestigated Complaints')
    # plt.xlabel('Prediction Score')
    # if contract_flag:
    #     plt.ylabel('Number of Contracts')
    # else:
    #     plt.ylabel('Number of Complaints')
    # plt.show()

    prediction_data = predict_df
    prediction_data['prediction_score'] = y_proba

    grouped = prediction_data[[
        'country', 'prediction_score'
    ]].groupby('country').aggregate(['mean', 'median', 'std', 'count'])

    grouped.columns = [' '.join(col).strip() for col in grouped.columns.values]
    #    print prediction_data.columns

    #    prediction_data[['country','prediction_score']].to_sql('prediction_scores_complaints_by_country_nocountryfeatures',engine,if_exists='replace')
    if contract_flag:
        output_df = prediction_data[[
            'wb_contract_number', 'fiscal_year', 'region', 'country',
            'project_id', 'project_name', 'contract_description', 'supplier',
            'borrower_contract_reference_number', 'amount', 'prediction_score'
        ]]
    else:
        output_df = prediction_data[[
            'wb_contract_number', 'fiscal_year', 'region', 'country',
            'project_id', 'project_name', 'contract_description', 'supplier',
            'borrower_contract_reference_number', 'amount',
            'allegation_category', 'prediction_score'
        ]]

    if '.csv' not in args.output_file:
        output_file = args.output_file + '.csv'
        output_table = args.output_file
    else:
        output_file = args.output_file
        output_table = re.sub(r'\.csv$', '', args.output_file)
        output_table_array = output_table.split("/")
        print(output_table_array)
        output_table = output_table_array[len(output_table_array) - 1]
    output_df.to_csv(output_file, encoding='utf-8')

    if len(output_table) > 63:
        output_table = output_table[:63]

    output_df.to_sql(output_table, engine, if_exists='replace')
Example #34
# Training and testing
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

docs_train, docs_test, y_train, y_test = train_test_split(
    twitter_tfidf,
    twitter_train['rank'],
    train_size=.90,
    test_size=.1,
    random_state=685)

from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor

clf = GradientBoostingClassifier().fit(docs_train, y_train)

y_pred = clf.predict(docs_test)
stop = timeit.default_timer()

print('Time: ', stop - start)
# from joblib import dump, load
# dump(clf, 'twitchsentiment.chatmodel')
print(sklearn.metrics.accuracy_score(y_test, y_pred))

#Testing

import csv
reviews_new = []

with open("../scrapedchat/a_seagull.csv", 'r', encoding='utf8') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
df["Cabin"].fillna("N", inplace=True)
df["Embarked"].fillna("N", inplace=True)
titanic_encode(df, ["Cabin", "Embarked", "Sex", "Ticket"])

df.loc[df["Age"].isnull(), "Age"] = df["Age"].mean()
df = df.drop(["Name", "Ticket"], axis=1)

input_data = df.drop(["Survived"], axis=1)
output_data = df["Survived"]

model = GradientBoostingClassifier()
model.fit(input_data, output_data)

test = pd.read_csv("test.csv")

test["Cabin"].fillna("N", inplace=True)
test["Embarked"].fillna("N", inplace=True)

titanic_encode(test, ["Cabin", "Embarked", "Sex", "Ticket"])

test.loc[test["Age"].isnull(), "Age"] = test["Age"].mean()
test.loc[test["Fare"].isnull(), "Fare"] = test["Fare"].mean()
test = test.drop(["Name", "Ticket"], axis=1)
print(model.predict(test))

submit = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": model.predict(test)
})
submit.to_csv("submit.csv", index=False)
Example #36
class GradientBoostingClassifier:
    def __init__(self, loss, learning_rate, n_estimators, subsample,
                 min_samples_split, min_samples_leaf,
                 min_weight_fraction_leaf, max_depth, criterion, max_features,
                 max_leaf_nodes, min_impurity_decrease, random_state=None,
                 verbose=0, **kwargs):
        self.loss = loss
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.subsample = subsample
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_depth = max_depth
        self.criterion = criterion
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.random_state = random_state
        self.verbose = verbose
        self.estimator = None
        self.fully_fit_ = False

    def fit(self, X, y, sample_weight=None):
        from sklearn.ensemble import GradientBoostingClassifier

        # Special fix for gradient boosting!
        if isinstance(X, np.ndarray):
            X = np.ascontiguousarray(X, dtype=X.dtype)

        if self.estimator is None:
            self.learning_rate = float(self.learning_rate)
            self.n_estimators = int(self.n_estimators)
            self.subsample = float(self.subsample)
            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)
            self.max_features = float(self.max_features)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.verbose = int(self.verbose)

            self.estimator = GradientBoostingClassifier(
                loss=self.loss,
                learning_rate=self.learning_rate,
                n_estimators=self.n_estimators,
                subsample=self.subsample,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_depth=self.max_depth,
                criterion=self.criterion,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                random_state=self.random_state,
                verbose=self.verbose,
                warm_start=True,
            )

        self.estimator.fit(X, y, sample_weight=sample_weight)

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    @staticmethod
    def get_cs():
        cs = ConfigurationSpace()
        loss = Constant("loss", "deviance")
        learning_rate = UniformFloatHyperparameter(
            name="learning_rate", lower=0.01, upper=1, default_value=0.1, log=True)
        # n_estimators = UniformIntegerHyperparameter(
        #     "n_estimators", 50, 500, default_value=100)
        n_estimators = Constant("n_estimators", 100)
        max_depth = UniformIntegerHyperparameter(
            name="max_depth", lower=1, upper=8, default_value=3)
        criterion = CategoricalHyperparameter(
            'criterion', ['friedman_mse', 'mse'],
            default_value='mse')
        min_samples_split = UniformIntegerHyperparameter(
            name="min_samples_split", lower=2, upper=20, default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter(
            name="min_samples_leaf", lower=1, upper=20, default_value=1)
        min_weight_fraction_leaf = UnParametrizedHyperparameter("min_weight_fraction_leaf", 0.)
        subsample = UniformFloatHyperparameter(
            name="subsample", lower=0.01, upper=1.0, default_value=1.0)
        max_features = UniformFloatHyperparameter(
            "max_features", 0.1, 1.0, default_value=1)
        max_leaf_nodes = UnParametrizedHyperparameter(
            name="max_leaf_nodes", value="None")
        min_impurity_decrease = UnParametrizedHyperparameter(
            name='min_impurity_decrease', value=0.0)
        cs.add_hyperparameters([loss, learning_rate, n_estimators, max_depth,
                                criterion, min_samples_split, min_samples_leaf,
                                min_weight_fraction_leaf, subsample,
                                max_features, max_leaf_nodes,
                                min_impurity_decrease])
        return cs
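
A minimal, hypothetical driver for the wrapper defined above (note it shadows sklearn's class of the same name); the data arrays are placeholders and the parameters come from the wrapper's own default configuration space:

# Hypothetical usage with the defaults from get_cs()
cs = GradientBoostingClassifier.get_cs()
params = cs.get_default_configuration().get_dictionary()

model = GradientBoostingClassifier(**params, random_state=1)  # the wrapper class above, not sklearn's
model.fit(X_train, y_train)   # X_train/y_train assumed to be numpy arrays
print(model.configuration_fully_fitted())
y_pred = model.predict(X_test)
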
Example #37
print('')

#Random Forest
print('Random Forest:')
rfc = RandomForestClassifier(random_state=2018)
rfc.fit(X_train_std,y_train)
rfc_predict = rfc.predict(X_test_std)
rfc_predict_proba = rfc.predict_proba(X_test_std)[:,1]
get_scores(y_test,rfc_predict,rfc_predict_proba)
print('')

#GBDT
print('GBDT:')
gdbt = GradientBoostingClassifier(random_state=2018)
gdbt.fit(X_train_std,y_train)
gdbt_predict = gdbt.predict(X_test_std)
gdbt_predict_proba = gdbt.predict_proba(X_test_std)[:,1]
get_scores(y_test,gdbt_predict,gdbt_predict_proba)
print('')

#XGBoost
print('XGBoost:')
xgbs = XGBClassifier(random_state=2018)
xgbs.fit(X_train_std,y_train)
xgbs_predict = xgbs.predict(X_test_std)
xgbs_predict_proba = xgbs.predict_proba(X_test_std)[:,1]
get_scores(y_test,xgbs_predict,xgbs_predict_proba)
print('')

#LightGBM
print('LightGBM:')
Example #38
                                                      train_file[1],
                                                      normalize=False)
    trainX, validX, trainY, validY = utils.train_test_split(
        trainX, trainY, 0.1)
    print(
        f'\033[32;1mtrainX: {trainX.shape}, trainY: {trainY.shape}, validX: {validX.shape}, validY: {validY.shape}\033[0m'
    )
    if training:
        model = GradientBoostingClassifier(
            learning_rate=0.1,
            n_estimators=200,
            max_depth=3,
            random_state=880301)  #, n_iter_no_change=10, tol=1e-4)
        model.fit(trainX, trainY.ravel())
        utils.save_model(model_path, model)
        #a = model.feature_importances_[1:].reshape(-1, 9)
        #for i in a:
        #    print(('%.3f '*9) % tuple(i))
    else:
        model = utils.load_model(model_path)

    if test:
        testX = utils.load_test_data(test[0], mean, std)
        utils.generate_csv(model.predict(testX), test[1])
    else:
        print(
            f'\033[32;1mTraining score: {model.score(trainX, trainY)}\033[0m')
        print(
            f'\033[32;1mValidaiton score: {model.score(validX, validY)}\033[0m'
        )
Example #39
def GBC(X_train, Y_train, X_test):
	clf = GradientBoostingClassifier()
	clf.fit(X_train, Y_train)
	pre = clf.predict(X_test.toarray())
	return pre
Example #40
    x = data.iloc[:, 1:]
    y = data['speed']

    # encode string values as integer
    x = encodeData(x)
    y = mapSpeed(y)

    if training:
        x = predictInjSeverity(x)

    return x, y


if __name__ == '__main__':
    # if len(sys.argv) != 3:
    #     print("Bad argument list, enter in following form:")
    #     print("python <script_name>.py <train_set_path> <test_set_path>")
    #     exit()
    # X_train, y_train = read(sys.argv[1], True)
    # X_test, y_test  = read(sys.argv[2])

    X_train, y_train = read("./resources/train.csv", True)
    X_test, y_test = read("./resources/z4_test.csv")

    clf = GradientBoostingClassifier(n_estimators=100)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='micro')
    print(f1)
Example #41

gb_GS.best_estimator_
gbcs=GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=20,
                           max_features=20, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=2, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
gbcs.fit(X_train, y_train)
gbcs.score(X_test, y_test) #0.9891213389121339
predicted=gbcs.predict(X_test)
{'learning_rate': 0.1,
 'max_depth': 20,
 'max_features': 20,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

pred_probs = gbcs.predict_proba(X_test)[:,1]

threshold = 0.5
predicted = pred_probs >= threshold

accuracy = accuracy_score(y_test, predicted)
precision = precision_score(y_test, predicted)
recall = recall_score(y_test, predicted)
Example #42
dt_predictions = dt.predict(X_test)

dt_data = pd.read_csv('test.csv')
dt_data.insert((dt_data.shape[1]),'Survived',dt_predictions)

dt_data.to_csv('Titanic_DecisionTrees.csv')

"""
Gradient Boost
"""
# Instantiate our model
gb = GradientBoostingClassifier()
gb.fit(X_train, Y_train)

gb_predictions = gb.predict(X_test)

gb_data = pd.read_csv('test.csv')
gb_data.insert((gb_data.shape[1]),'Survived',gb_predictions)

gb_data.to_csv('Titanic_GradientBoost.csv')

"""
XGBoost
"""
# Instantiate our model
xg = XGBClassifier(learning_rate=0.02, n_estimators=750,
                   max_depth= 3, min_child_weight= 1, 
                   colsample_bytree= 0.6, gamma= 0.0, 
                   reg_alpha= 0.001, subsample= 0.8
                  )
# print ("precision" , "recall", "fscore", "support")
# print ("0 unrelated: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[0]))
# print ("1 agree: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[1]))
# print ("2 disagree: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[2]))
# print ("3 discuss: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[3]))

### -------------------LogisticRegression-------------------
# LogisticRegression = LogisticRegression()
# LogisticRegression.fit(X_train,y_train)
# y_Pred = LogisticRegression.predict(X_test)

# print ("LogisticRegression")
# print ("accuracy:",LogisticRegression.score(X_test,y_test))
# print ("confusion_matrix:\n",confusion_matrix(y_test, y_Pred))
# print ("precision" , "recall", "fscore", "support")
# print ("0 unrelated: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[0]))
# print ("1 related: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[1]))

## -------------------- GradientBoosting --------------------
GradientBoosting = GradientBoostingClassifier()
GradientBoosting.fit(X_train,y_train)
y_Pred = GradientBoosting.predict(X_test)

print ("GradientBoosting")
print ("accuracy:",GradientBoosting.score(X_test,y_test))
print ("confusion_matrix:\n",confusion_matrix(y_test, y_Pred))
print ("precision" , "recall", "fscore", "support")
print ("0 unrelated: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[0]))
print ("1 related: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[1]))
print ("2 disagree: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[2]))
print ("3 discuss: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[3]))
Example #44
    labels_test = []
    for ii in train_idx:
        features_train.append(features[ii])
        labels_train.append(labels[ii])
    for jj in test_idx:
        features_test.append(features[jj])
        labels_test.append(labels[jj])

    clf.fit(features_train, labels_train)
    clf2.fit(features_train, labels_train)
    clf3.fit(features_train, labels_train)
    clf4.fit(features_train, labels_train)
    clfvote.fit(features_train, labels_train)

    predictions1 = clf.predict(features_test)
    predictions2 = clf2.predict(features_test)
    predictions3 = clf3.predict(features_test)
    predictions4 = clf4.predict(features_test)
    predictions = clfvote.predict(features_test)

    clf_f1.append(f1_score(labels_test, predictions1))
    clf2_f1.append(f1_score(labels_test, predictions2))
    clf3_f1.append(f1_score(labels_test, predictions3))
    clf4_f1.append(f1_score(labels_test, predictions4))
    clfvote_f1.append(f1_score(labels_test, predictions))

    # Added after GaussianNB() known to be best clf to evaluate
    for prediction, truth in zip(predictions1, labels_test):
        if prediction == 0 and truth == 0:
            true_negatives += 1
        elif prediction == 0 and truth == 1:
Example #45
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: 0.8292963321682738
exported_pipeline = GradientBoostingClassifier(learning_rate=0.5,
                                               max_depth=4,
                                               max_features=0.05,
                                               min_samples_leaf=8,
                                               min_samples_split=12,
                                               n_estimators=100,
                                               subsample=0.9500000000000001)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
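
# A small evaluation sketch (not part of the TPOT export): score the held-out split.
from sklearn.metrics import accuracy_score
print("Hold-out accuracy:", accuracy_score(testing_target, results))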
Exemplo n.º 46
0
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer releases
from sklearn.ensemble import GradientBoostingClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(
    tpot_data.index,
    stratify=tpot_data['class'].values,
    train_size=0.75,
    test_size=0.25)

result1 = tpot_data.copy()

# Perform classification with a gradient boosting classifier
gbc1 = GradientBoostingClassifier(learning_rate=0.49,
                                  max_features=1.0,
                                  min_weight_fraction_leaf=0.09,
                                  n_estimators=500,
                                  random_state=42)
gbc1.fit(result1.loc[training_indices].drop('class', axis=1).values,
         result1.loc[training_indices, 'class'].values)

result1['gbc1-classification'] = gbc1.predict(
    result1.drop('class', axis=1).values)
Exemplo n.º 47
0

def pca(x, n_feature):
    # Centre the data, then project onto the leading eigenvectors of the covariance matrix.
    mean_x = np.mean(x, 0)
    x = x - mean_x                      # avoid mutating the caller's array in place
    eig, vec = np.linalg.eig(np.dot(x.T, x))
    idx = np.argsort(-eig)              # sort eigenvalues in descending order
    W = vec[:, idx[:n_feature]]         # keep the n_feature leading components
    new_x = np.dot(x, W)
    return new_x


bone_data = pd.read_csv('all_bone_info_df.csv')
features_list = list(bone_data.columns)[1:]
features_list.remove('class_id')
features_list.remove('target')

x = bone_data[features_list]
y = bone_data['target']  # 1-D labels avoid a column-vector warning when fitting
PCA_x = pca(x.values, 10)

x_train, x_test, y_train, y_test = train_test_split(PCA_x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
gbdt = GradientBoostingClassifier(random_state=3)
gbdt.fit(x_train, y_train)
y_pred = gbdt.predict(x_test)
# print(y_pred.dtype, y_test.values.dtype)
print("accuracy: %.4g" % (metrics.accuracy_score(y_test, y_pred)))
print(len(features_list))
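
# Follow-up sketch: per-class precision/recall/F1 for the PCA + GBDT model, using the
# `metrics` module already referenced above.
print(metrics.classification_report(y_test, y_pred))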
Exemplo n.º 48
0
# Load data
iris_dataset = load_iris()
data, target, target_names = (iris_dataset["data"], iris_dataset["target"],
                              iris_dataset["target_names"])

# Instantiate model
model = GradientBoostingClassifier()

# Training and validation split
perm = np.random.permutation(len(data))  # shuffle features and labels together so rows stay aligned
data, target = data[perm], target[perm]
train_x, train_y = data[:100], target[:100]
val_x, val_y = data[100:], target[100:]

# Train and evaluate models
model.fit(train_x, train_y)
print("MSE:", mean_squared_error(model.predict(val_x), val_y))

# Save the model and label to file
with open("/tmp/iris_model_logistic_regression.pkl", "wb") as f:
    pickle.dump(model, f)
with open("/tmp/iris_labels.json", "w") as f:
    json.dump(target_names.tolist(), f)
# __doc_train_model_end__


# __doc_define_servable_begin__
class BoostingModel:
    def __init__(self):
        with open("/tmp/iris_model_logistic_regression.pkl", "rb") as f:
            self.model = pickle.load(f)
        with open("/tmp/iris_labels.json") as f:
Exemplo n.º 49
0
# training
print(train_data.shape)
skf = StratifiedKFold(n_splits=5)
eval_result = np.zeros((train_data.shape[0], 1))
predict_label = np.zeros((predict_data.shape[0], 2))
i = 0
for train_index, eval_index in skf.split(train_data, train_label):
    print('start ', i)
    i += 1
    split_train, split_train_label = train_data.iloc[
        train_index], train_label.iloc[train_index]
    eval_data, eval_label = train_data.iloc[eval_index], train_label.iloc[
        eval_index]
    classifier = GradientBoostingClassifier(n_estimators=500)
    classifier.fit(split_train, split_train_label)
    eval_result[eval_index] = classifier.predict(eval_data).reshape(
        eval_data.shape[0], 1)
    predict_label += classifier.predict_proba(predict_data).reshape(
        predict_data.shape[0], 2)

# test accuracy
print('ac ', accuracy_score(train_label, eval_result))
print('precision ', precision_score(train_label, eval_result))
print('recall ', recall_score(train_label, eval_result))
print('f1_score ', f1_score(train_label, eval_result))

# predict
predict_label = np.argmax(predict_label, axis=1)
predict_label = pd.DataFrame(predict_label, columns=['income'])
predict_label = pd.DataFrame(predict_label['income'].map(
    lambda item: '>50K' if item == 1.0 else '<=50K'),
                             columns=['income'])
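
# A possible final step (a sketch; the output path is hypothetical, not taken from the
# original script): persist the mapped predictions.
predict_label.to_csv('income_predictions.csv', index=False)  # hypothetical filename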
print "start fitting"
clf2.fit(feat, la)
print "saving model"
# from sklearn.externals import joblib
# joblib.dump(clf2, '../gbdt_feat/model/gbdt.model')
# jbdt = joblib.load('../gbdt_feat/model/gbdt.model')
# for m in range(len(model.feature_importances_)):
#     if model.feature_importances_[m]>0.05:
#         print "feature_importance",m,model.feature_importances_[m]
print "loading test data"
test = np.loadtxt(open("../gbdt_feat/gbdt2_3_class_feat_online.csv", "rb"),
                  delimiter=",",
                  skiprows=0)
import numpy as np
print "predicting"
pre = clf2.predict(test)

f = open("../submit/gbdt2_3_result_classifer_online.csv", "wb")
write = csv.writer(f)
write.writerow(["passengercount", "WIFIAPTag", "slice10min"])
for i in range(len(pre)):
    pre_date = "2016-09-25-"
    wifiname = wifi_name_dict[int(test[i][1])]
    slice10h = 15 + int(test[i][2] - 1) / 6
    slice10m = int((test[i][2] - 1) % 6)
    pre_data = pre_date + str(slice10h) + "-" + str(slice10m)
    write.writerow([str(pre[i]), wifiname, pre_data])
f.close()
for i in range(len(clf2.feature_importances_)):
    print clf2.feature_importances_[i]
X = train[predictors]

scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=predictors)

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

gbm0 = GradientBoostingClassifier(random_state=10)

#Fit the algorithm on the data
gbm0.fit(train_X, train_y)

#Predict on the validation set:
train_predictions = gbm0.predict(val_X)
train_predprob = gbm0.predict_proba(val_X)[:, 1]

#Perform cross-validation:
cv_score = cross_validation.cross_val_score(gbm0,
                                            train_X,
                                            train_y,
                                            cv=5,
                                            scoring='roc_auc')

#Print model report:
print("\nModel Report")
print("Accuracy : %.4g" %
      metrics.accuracy_score(val_y.values, train_predictions))
print("AUC Score (Train): %f" % metrics.roc_auc_score(val_y, train_predprob))
Exemplo n.º 52
0
# In[ ]:

my_model = XGBClassifier(n_estimators=150, learning_rate=0.25)
my_model.fit(df.values, pred, verbose=True)

# In[ ]:

predictions = my_model.predict(test.values)
col = pd.Series(predictions)
final_df = pd.DataFrame({"PassengerId": c1, "Survived": col})
final_df.to_csv("XGBSub.csv", index=False)

# In[ ]:

final_df.sample(19)

# In[ ]:

gbc = GradientBoostingClassifier(n_estimators=150,
                                 learning_rate=1,
                                 max_depth=3,
                                 random_state=0).fit(df.values, pred)
x = gbc.predict(test.values)
c4 = pd.Series(list(x))
final_df = pd.DataFrame({"PassengerId": c1, "Survived": c4})
final_df.to_csv("GBMSub.csv", index=False)

# In[ ]:

# In[ ]:
vec = DictVectorizer()
X = vec.fit_transform(X).toarray()

import random
random.seed(1)

#Splitting the data for training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 1234)

from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

predicted= model.predict(X_test)
print ("Model accuracy is %.2f" % accuracy_score(predicted, y_test))

location =r"C:\Users\Latoya Clarke\Desktop\Data for Analysis\Loan Prediction\test.csv"
loan_test = pd.read_csv(location)

loan_test['Gender'] = loan_test['Gender'].fillna('Male')
loan_test['Married'] = loan_test['Married'].fillna('Yes')
loan_test['Dependents'] = loan_test['Dependents'].fillna(0)
loan_test['Self_Employed'] = loan_test['Self_Employed'].fillna('No')
loan_test['LoanAmount'] = loan_test['LoanAmount'].fillna(round(loan_test['LoanAmount'].mean(),1))
loan_test['Loan_Amount_Term'] = loan_test['Loan_Amount_Term'].fillna(round(loan_test['Loan_Amount_Term'].mean(),1))
loan_test['Credit_History'] = loan_test['Credit_History'].fillna(round(loan_test['Credit_History'].mean(),0))

loan_selected_1 = loan_test.drop(['Loan_ID'], axis = 1)
X_1= loan_selected_1.to_dict(orient='records')
Exemplo n.º 54
0
#Ensemble Classifier

from sklearn.ensemble import VotingClassifier
# estimators=[('gnb', gnb), ('rf', rf), ('log_reg', logreg),('decesiontree',dt),('gradientBoost',gb_clf),('gaussian',gpc)]
estimators = [('decesiontree', dt), ('gradientBoost', gb_clf),
              ('gaussian', gpc)]
ensemble = VotingClassifier(estimators, voting='hard')
ensemble.fit(X_train[:100], y_train[:100])
print('Ensemble: ' + str(ensemble.score(X_test, y_test)) + "\n")

# y_pred_class1 = gnb.predict(X_test)
# y_pred_class2= rf.predict(X_test)
# y_pred_class3 = logreg.predict(X_test)
y_pred_class4 = dt.predict(X_test)
y_pred_class5 = gb_clf.predict(X_test)
y_pred_class6 = gpc.predict(X_test)

y_test_le = le.fit_transform(y_test)
# y_pred_class1_le = le.fit_transform(y_pred_class1)
# y_pred_class2_le = le.fit_transform(y_pred_class2)
# y_pred_class3_le = le.fit_transform(y_pred_class3)
y_pred_class4_le = le.fit_transform(y_pred_class4)
y_pred_class5_le = le.fit_transform(y_pred_class5)
y_pred_class6_le = le.fit_transform(y_pred_class6)

#GNB

class1_tp = 0
class1_fn = 0
class1_fp = 0
Exemplo n.º 55
0
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Final result: test_data is a dataframe
    # test_data.to_csv('answer.csv', index=False, encoding='utf-8') # From pandas library
Exemplo n.º 56
0
# Each learner aims to reduce the residuals (errors) produced by the previous learner.
# The two main hyper-parameters are:
#
# - The **learning rate** (*lr*) controls over-fitting:
#   decreasing the *lr* limits the capacity of a learner to overfit the residuals, i.e.,
#   it slows down the learning speed and thus increases the **regularisation**.
#
# - The **sub-sampling fraction** controls the fraction of samples used for
#   fitting each learner. Values smaller than 1 lead to **Stochastic Gradient Boosting**.
#   It thus controls over-fitting by reducing variance and increasing bias.
#
# .. figure:: ../images/gradient_boosting.png
#    :width: 500
#    :alt: Gradient boosting.
#

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100,
                                learning_rate=0.1,
                                subsample=0.5,
                                random_state=0)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)
y_prob = gb.predict_proba(X_test)[:, 1]

print("bAcc: %.2f, AUC: %.2f " %
      (metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
       metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
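
# A small illustration of the learning-rate/regularisation trade-off described above
# (a sketch reusing the same splits): lower learning rates fit the residuals more slowly
# and typically generalise better for a fixed number of estimators.
for lr in (1.0, 0.1, 0.01):
    gb_lr = GradientBoostingClassifier(n_estimators=100, learning_rate=lr,
                                       subsample=0.5, random_state=0)
    gb_lr.fit(X_train, y_train)
    print("lr=%.2f  train bAcc=%.2f  test bAcc=%.2f" %
          (lr,
           metrics.balanced_accuracy_score(y_train, gb_lr.predict(X_train)),
           metrics.balanced_accuracy_score(y_test, gb_lr.predict(X_test))))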
Exemplo n.º 57
0
		if record[10] != 20 or age_of20 < 25000:
			training.append(list(record[0:10]))
			label_of_training.append(record[10])
	counter = counter + 1


starttime = time.time()
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf = clf.fit(training, label_of_training)
endtime = time.time()
TP = 0 ### correctly predicted cases whose ages are below 40
FN = 0 ### cases mis-predicted as age > 40
FP = 0 ### cases mis-predicted as age <= 40
TN = 0 ### correctly predicted cases whose ages are above 40

count = 0
for elem in clf.predict(testing):
	if label_of_testing[count] <= 40 and elem <= 40:
		TP = TP + 1
	elif label_of_testing[count] <= 40 and elem > 40:
		FN = FN + 1
	elif label_of_testing[count] > 40 and elem <= 40:
		FP = FP + 1
	else:
		TN = TN + 1
	count = count + 1

print("Accuracy Rate: ", (TN + TP)/len(testing))
print("Precision Rate: ", TP/(TP + FP))
print("Recall Rate: ", TP/(TP + FN))
print("Model Construction Time: ", (endtime - starttime), " sec")
Exemplo n.º 58
0
import sys
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

if __name__ == '__main__':
    X_train = pd.read_csv(sys.argv[1]).values
    y_train = pd.read_csv(sys.argv[2]).values.reshape(-1)
    X_test = pd.read_csv(sys.argv[3]).values
    print("X_train:", X_train.shape, end=' / ')
    print("y_train:", y_train.shape, end=' / ')
    print("X_test:", X_test.shape)
    gbc = GradientBoostingClassifier(n_estimators=700)
    gbc.fit(X_train, y_train)
    y_predict = gbc.predict(X_test)
    print("y_predict:", y_predict, "/ shape:", y_predict.shape)
    data = np.c_[np.arange(len(y_predict)) + 1, y_predict]
    fo = open(sys.argv[4], 'w')
    fo.write(pd.DataFrame(data, columns=['id', 'label']).to_csv(index=False))
    fo.close()
Exemplo n.º 59
0
#grid_search.fit(X_train, Y_train)
#grid_search.best_params_

# Random Forests

random_forest = RandomForestClassifier(n_estimators=100)
#random_forest = RandomForestClassifier(n_estimators=100,
#										 criterion='entropy',
#										 max_depth=10,
#										 max_features='sqrt',
#										 min_samples_split=5)
random_forest.fit(X_train, Y_train)
Y_pred_1 = random_forest.predict(X_test)

#grid_2 = { "loss"          : ["deviance","exponential"],
#               "n_estimators"  : [100],
#              "max_features"      : ['sqrt','log2',0.2,0.5,0.8]}
#GB=GradientBoostingClassifier()
#grid_search = sklearn.model_selection.GridSearchCV(GB, grid_2, n_jobs=-1, cv=5)
#grid_search.fit(X_train, Y_train)
#grid_search.best_params_
random_forest.score(X_train, Y_train)

#gradient_boost = GradientBoostingClassifier(n_estimators=100,loss='exponential',max_features='log2')
gradient_boost = GradientBoostingClassifier(n_estimators=100)
gradient_boost.fit(X_train, Y_train)

Y_pred_2 = gradient_boost.predict(X_test)

gradient_boost.score(X_train, Y_train)
Exemplo n.º 60
0
                                   loss='deviance',
                                   max_depth=50,
                                   max_features=2,
                                   max_leaf_nodes=100,
                                   min_samples_leaf=1,
                                   min_samples_split=2,
                                   min_weight_fraction_leaf=.2,
                                   n_estimators=100,
                                   presort='auto',
                                   random_state=None,
                                   subsample=1.0,
                                   verbose=1,
                                   warm_start=False)
model.fit(x_train, y_train)

res22 = model.predict([x_train[0]])

import RPi.GPIO as GPIO
from time import sleep

GPIO.setwarnings(False)
GPIO.setmode(GPIO.BOARD)
GPIO.setup(3, GPIO.OUT, initial=GPIO.LOW)
GPIO.setup(5, GPIO.OUT, initial=GPIO.LOW)
GPIO.setup(7, GPIO.OUT, initial=GPIO.LOW)

if res22[0] == 0:
    GPIO.output(3, GPIO.HIGH)
elif res22[0] == 1:
    GPIO.output(5, GPIO.HIGH)
else: