def test_sample_weight():
    """Check how VotingClassifier handles the ``sample_weight`` fit argument.

    Three things are verified, using the module-level ``X`` / ``y`` data:

    * fitting with all-ones weights gives the same predictions as fitting
      without weights;
    * a one-estimator ensemble fitted with random weights agrees with that
      estimator fitted directly with the same weights;
    * a ValueError with an explanatory message is raised when one of the
      estimators (KNN here) is fitted with sample weights it does not accept.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)

    def soft_ensemble():
        # A fresh ensemble over the same three estimators for each fit.
        return VotingClassifier(
            estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)],
            voting='soft')

    eclf1 = soft_ensemble().fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = soft_ensemble().fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    # Probabilities come from two independent fits: compare them with a
    # tolerance rather than bit-for-bit.
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    # An estimator without sample_weight support must be reported by name.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = "Underlying estimator 'knn' does not support sample weights."
    assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)
def test_predict_on_toy_problem():
    """Manually check predicted class labels for a toy dataset.

    Each base estimator, and then a hard- and a soft-voting ensemble of the
    three, is fitted on six 2-D points and must reproduce the training labels
    exactly.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])

    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    # Compare label by label.  Wrapping both sides in ``all(...)`` would
    # reduce the check to ``True == True`` for any non-zero labels.
    for clf in (clf1, clf2, clf3):
        assert_array_equal(clf.fit(X, y).predict(X), expected)

    for voting in ('hard', 'soft'):
        eclf = VotingClassifier(estimators=[
                                ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                                voting=voting,
                                weights=[1, 1, 1])
        assert_array_equal(eclf.fit(X, y).predict(X), expected)
Exemplo n.º 3
0
def voting_fit(X, y, RESULT_TEST_PATH,RESULT_PATH):
    """Fit a soft-voting ensemble and write its predictions to a CSV file.

    Parameters
    ----------
    X, y
        Training features and labels, passed to every ``fit_*`` helper and to
        the ensemble itself.
    RESULT_TEST_PATH
        Path of the CSV file holding the rows to predict.  It must contain a
        ``PassengerId`` column and must not already contain ``Survived``.
    RESULT_PATH
        Path the two-column (``PassengerId``, ``Survived``) CSV is written to.
    """
    ada_best = fit_adaboost(X, y)
    extratree_best = fit_extratree(X, y)
    rf_best = fit_rf(X, y)
    gbdt_best = fit_xgboost(X, y)
    # NOTE(review): the SVC is still tuned here but is not part of the
    # ensemble below; kept so the sequence of helper calls is unchanged.
    svc_best = fit_svc(X, y)
    lr_best = fit_lr(X, y)

    votingC = VotingClassifier(estimators=[('rfc', rf_best), ('extc', extratree_best),('lr',lr_best),
                                            ('adac', ada_best), ('gbc', gbdt_best)], voting='soft',
                               n_jobs=4)
    votingC.fit(X, y)

    test_df = pd.read_csv(RESULT_TEST_PATH)
    result = votingC.predict(np.array(test_df))
    test_df.insert(test_df.columns.size, 'Survived', result)

    # Work on an explicit copy: assigning into a column selection of
    # ``test_df`` would trigger pandas' SettingWithCopyWarning.
    submission = test_df[['PassengerId', 'Survived']].copy()
    submission['PassengerId'] = submission['PassengerId'].apply(np.int64)
    submission.to_csv(RESULT_PATH, index=False)
    print("finish!")
Exemplo n.º 4
0
def main(directory, tools_directory, non_tools_dir):
    """Classify the publications in ``directory`` and copy them into two folders.

    A VotingClassifier built from the first three entries of the module-level
    ``classifier_list`` is trained on the data returned by
    ``fetch_from_file()`` and applied to the test set returned by
    ``get_test_set(directory)``.  Files with a truthy prediction are copied
    to ``tools_directory``, the others to ``non_tools_dir``.  Both output
    directories are deleted and recreated first.

    Paths are built by plain string concatenation, so all three directory
    arguments are expected to end with a path separator.

    Exits the process with status 1 when ``directory`` is missing or is not a
    directory.
    """
    global path
    path = sys.path[0]
    start = time.time()
    if directory is None or not os.path.isdir(directory):
        print("Please input directory containing pdf publications to classify")
        sys.exit(1)
    x_train, y_train = fetch_from_file()
    x_test, test_files = get_test_set(directory)
    # Just for testing, update machine learning part later

    x_train, x_test = normalize_scale(x_train, x_test)
    # Estimator names must be unique within a VotingClassifier.
    classifier = VotingClassifier(
        [("first", classifier_list[0]), ("second", classifier_list[1]), ("third", classifier_list[2])]
    )
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Start from empty output directories.
    for out_dir in (tools_directory, non_tools_dir):
        if os.path.isdir(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)

    for num, pub in zip(y_pred, test_files):
        destination = tools_directory if num else non_tools_dir
        shutil.copy2(directory + pub, destination + pub)

    print("Classification:    Seconds taken: " + str(time.time() - start))
    def process_cell(self, df_cell_train, df_cell_test, window):
        """Fit a KNN + random-forest soft-voting model on one cell.

        Training rows whose ``place_id`` occurs fewer than ``th`` times
        (``th`` is a module-level threshold) are dropped before fitting.
        ``window`` is accepted but not used here.

        Returns a tuple ``(pred_labels, row_ids)``: for every test row the
        three place ids with the highest predicted probability (best first),
        and the index of ``df_cell_test``.
        """
        # Drop rarely seen places.
        occurrences = df_cell_train.place_id.value_counts()
        frequent = (occurrences[df_cell_train.place_id.values] >= th).values
        df_cell_train = df_cell_train.loc[frequent]

        row_ids = df_cell_test.index

        # Encode the targets and turn both frames into integer matrices.
        encoder = LabelEncoder()
        y = encoder.fit_transform(df_cell_train.place_id.values)
        X = df_cell_train.drop(['place_id', ], axis=1).values.astype(int)
        X_test = df_cell_test.values.astype(int)

        knn = KNeighborsClassifier(n_neighbors=50, weights='distance',
                                   metric='manhattan')
        forest = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        ensemble = VotingClassifier(estimators=[('knn', knn), ('rf', forest)],
                                    voting='soft')
        ensemble.fit(X, y)

        # Sort classes by descending probability and keep the best three.
        proba = ensemble.predict_proba(X_test)
        top_three = np.argsort(proba, axis=1)[:, ::-1][:, :3]
        return encoder.inverse_transform(top_three), row_ids
def test_set_params():
    """Estimators and their hyper-parameters can be replaced via set_params."""
    log_reg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    naive_bayes = GaussianNB()

    # Reference ensemble: logistic regression + random forest.
    eclf1 = VotingClassifier([('lr', log_reg), ('rf', forest)],
                             voting='soft', weights=[1, 2])
    assert_true('lr' in eclf1.named_estimators)
    assert_true(eclf1.named_estimators.lr is eclf1.estimators[0][1])
    assert_true(eclf1.named_estimators.lr is eclf1.named_estimators['lr'])
    eclf1.fit(X, y)
    assert_true('lr' in eclf1.named_estimators_)
    assert_true(eclf1.named_estimators_.lr is eclf1.estimators_[0])
    assert_true(eclf1.named_estimators_.lr is eclf1.named_estimators_['lr'])

    # Second ensemble starts with naive Bayes under the name 'nb', which is
    # then swapped for the random forest through set_params.
    eclf2 = VotingClassifier([('lr', log_reg), ('nb', naive_bayes)],
                             voting='soft', weights=[1, 2])
    eclf2.set_params(nb=forest).fit(X, y)
    assert_false(hasattr(eclf2, 'nb'))

    # After the swap both ensembles must behave the same.
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X),
                              eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), log_reg.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), forest.get_params())

    # Nested parameters are addressed as <name>__<param>.
    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert_true(eclf1.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(eclf2.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])
def test_estimator_weights_format():
    """Weights given as a list or as a NumPy array yield the same probabilities."""
    log_reg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)

    with_list = VotingClassifier(
        estimators=[("lr", log_reg), ("rf", forest)],
        weights=[1, 2],
        voting="soft")
    with_array = VotingClassifier(
        estimators=[("lr", log_reg), ("rf", forest)],
        weights=np.array((1, 2)),
        voting="soft")

    with_list.fit(X, y)
    with_array.fit(X, y)
    assert_array_equal(with_list.predict_proba(X), with_array.predict_proba(X))
Exemplo n.º 8
0
def process_one_cell(df_train, df_test, x_min, x_max, y_min, y_max):
    """Train and apply a voting ensemble for one rectangular grid cell.

    Training rows are taken from the cell enlarged by a fixed border on each
    side; test rows are taken from the exact cell ``[x_min, x_max) x
    [y_min, y_max)``.  Training places seen fewer than ``th`` times are
    discarded, and the ``x`` / ``y`` columns are scaled by ``fw[0]`` /
    ``fw[1]`` (``th`` and ``fw`` are module-level settings).

    Returns
    -------
    (pred_labels, row_ids)
        For each test row the three place ids with the highest predicted
        probability (best first), and the index of the selected test rows.
        ``(None, None)`` when the cell has no training or no test rows.
    """
    x_border_augment = 0.025
    y_border_augment = 0.0125

    # Training rows: the cell plus its border.
    in_train_cell = ((df_train['x'] >= x_min - x_border_augment) &
                     (df_train['x'] < x_max + x_border_augment) &
                     (df_train['y'] >= y_min - y_border_augment) &
                     (df_train['y'] < y_max + y_border_augment))
    df_cell_train = df_train[in_train_cell]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    # Copy so the in-place scaling below never writes through to a slice of
    # the caller's frame (and does not raise SettingWithCopyWarning).
    df_cell_train = df_cell_train.loc[mask].copy()

    # Test rows: the exact cell.
    in_test_cell = ((df_test['x'] >= x_min) & (df_test['x'] < x_max) &
                    (df_test['y'] >= y_min) & (df_test['y'] < y_max))
    df_cell_test = df_test[in_test_cell].copy()
    row_ids = df_cell_test.index

    if len(df_cell_train) == 0 or len(df_cell_test) == 0:
        return None, None

    # Feature weighting on x and y.
    df_cell_train.loc[:, 'x'] *= fw[0]
    df_cell_train.loc[:, 'y'] *= fw[1]
    df_cell_test.loc[:, 'x'] *= fw[0]
    df_cell_test.loc[:, 'y'] *= fw[1]

    # Preparing data.
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values.astype(float)

    if 'place_id' in df_cell_test.columns:
        cols = df_cell_test.columns.drop('place_id')
        X_test = df_cell_test[cols].values.astype(float)
    else:
        X_test = df_cell_test.values.astype(float)

    # Applying the classifier.
    clf1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=26, weights='distance',
                                metric='manhattan'), n_jobs=-1, n_estimators=50)
    clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Soft voting is required: predict_proba is not available on a
    # VotingClassifier configured with voting='hard'.
    eclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)], voting='soft')

    eclf.fit(X, y)
    y_pred = eclf.predict_proba(X_test)
    # Classes sorted by descending probability; keep the best three.
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])

    return pred_labels, row_ids
Exemplo n.º 9
0
def classify():
    train_X,Y = load_svmlight_file('data/train_last')
    test_X,test_Y = load_svmlight_file('data/test_last')
    train_X = train_X.toarray()
    test_X = test_X.toarray()
    Y = [int(y) for y in Y]
    # print 'Y:',len(Y)
    rows = pd.read_csv('data/log_test2.csv',index_col=0).sort_index().index.unique()
    train_n = train_X.shape[0]
    m = train_X.shape[1]
    test_n = test_X.shape[0]
    print train_n,m,#test_n
     # 先用训练集训练出所有的分类器
    print 'train classify...'
    clf1 = LinearDiscriminantAnalysis()
    clf2 = GaussianNB()
    clf3 = LogisticRegression()
    clf4 = RandomForestClassifier()
    clf5 = KNeighborsClassifier(n_neighbors=12)
    clf6 = AdaBoostClassifier()
    # x_train,x_test,y_train,y_test = train_test_split(train_X,Y,test_size=0.2) # 对训练集进行划分

    # print x_train.shape
    # print x_test.shape
    # clf.fit(train_X,Y)
    clf = VotingClassifier(estimators=[('la',clf1),('nb',clf2),('lr',clf3),('rf',clf4),('nn',clf5),('ac',clf6)], voting='soft', weights=[1.5,1,1,1,1,1])
    # clf1.fit(x_train,y_train)
    # clf2.fit(x_train,y_train)
    # clf3.fit(x_train,y_train)
    # clf4.fit(x_train,y_train)
    clf.fit(train_X,Y)
    print 'end train classify'

    print 'start classify....'
    # print metrics.classification_report(Y,predict_Y)
    # clf2.fit(train_X,Y)
    # print 'clf2 fited...'
    # clf3.fit(train_X,Y)
    # print 'clf3 fited...'
    # clf4.fit(train_X,Y)
    # print 'clf4 fited...'
    # clf1.fit(train_X,Y)
    # print 'clf1 fited...'
    # 第一个分类结果
    predict_Y = clf.predict(train_X)
    # predict_Y = clf.predict(train_X)
    print 'classify result:'
    print metrics.classification_report(Y,predict_Y)

    predict_Y = clf.predict(test_X)
    # print predict_Y,len(predict_Y)
    print 'end classify...'
    # predict_Y = clf.predict(X[cnt_train:]) # 训练注释这一行,输出测试集打开这一行,注释之后的print metric
    # predict_Y = clf.predict(test_X) # 训练注释这一行,输出测试集打开这一行,注释之后的print metric
    DataFrame(predict_Y,index=rows).to_csv('data/info_test2.csv', header=False)
def test_multilabel():
    """Fitting on multilabel targets may raise NotImplementedError.

    The test tolerates that exception and also passes when ``fit`` succeeds;
    any other exception propagates.
    """
    X, y = make_multilabel_classification(
        n_classes=2,
        n_labels=1,
        allow_unlabeled=False,
        random_state=123)
    one_vs_rest = OneVsRestClassifier(SVC(kernel="linear"))
    ensemble = VotingClassifier(estimators=[("ovr", one_vs_rest)],
                                voting="hard")

    try:
        ensemble.fit(X, y)
    except NotImplementedError:
        pass
Exemplo n.º 11
0
def test_predict_for_hard_voting():
    """Hard voting must cope with an estimator returning non-integer labels.

    The test only checks that ``fit`` and ``predict`` run without raising.
    """
    members = [
        ('fsvc', FaultySVC(random_state=123)),
        ('gnb', GaussianNB()),
        ('svc', SVC(probability=True, random_state=123)),
    ]
    ensemble = VotingClassifier(estimators=members, weights=[1, 2, 3],
                                voting='hard')

    ensemble.fit(X, y)
    ensemble.predict(X)
Exemplo n.º 12
0
    def train(self):
        """Fit and persist one VotingClassifier per xy bin.

        For every id in ``self.xy_bins`` (ascending), the rows of ``self.df``
        whose ``xy_bin`` equals that id are used to fit a VotingClassifier
        over ``self.models`` with ``self.features`` as inputs and ``place_id``
        as target.  Each fitted model is dumped with joblib (compress=3) to
        the path given by ``xybins_file_name_str.format(bin_id)``.
        """
        for bin_id in sorted(self.xy_bins):
            target_path = xybins_file_name_str.format(bin_id)
            print('Training model: {} of {}'.format(bin_id, max(self.xy_bins)))

            frame = self.df
            in_bin = frame[frame.xy_bin == bin_id]
            inputs = in_bin[self.features]
            targets = in_bin.place_id

            ensemble = VotingClassifier(self.models)
            ensemble.fit(inputs, targets)
            joblib.dump(ensemble, target_path, compress=3)
Exemplo n.º 13
0
def test_sample_weight_kwargs():
    """VotingClassifier must forward sample_weight as a keyword argument."""

    class MockClassifier(BaseEstimator, ClassifierMixin):
        """Stub whose fit asserts that 'sample_weight' arrived by keyword."""

        def fit(self, X, y, *args, **sample_weight):
            assert_true('sample_weight' in sample_weight)

    ensemble = VotingClassifier(estimators=[('mock', MockClassifier())],
                                voting='soft')

    # Must complete without raising.
    unit_weights = np.ones((len(y),))
    ensemble.fit(X, y, sample_weight=unit_weights)
Exemplo n.º 14
0
def main(path,filename):
	"""Train a large voting ensemble on the feature batches and pickle it.

	Labels are loaded from the 'clases' batch and flattened; features are
	accumulated into one list by calling ``load_batch`` once per feature
	batch name.  The ensemble consists of ``RandomForest()``, ``Boosting()``,
	``Gradient(i)`` for i in 0..14 and ``SVM(i)`` for i in 0..3, which are
	presumably (name, estimator) pairs as VotingClassifier expects (verify
	against those helpers).  The fitted classifier is written to
	'clf_grande.p' in the current working directory.

	Returns None.
	"""
	batchs = ['histogramaByN','histogramaColor','patronesCirculaesByN_2_5','patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12']

	X = []
	y = []
	load_batch(y,path,'clases',filename)
	# The labels arrive as a list of lists: flatten them.
	y = [j for i in y for j in i]
	for batch in batchs:
		load_batch(X,path,batch,filename)

	est = [RandomForest(),Boosting()]
	est.extend(Gradient(i) for i in range(15))
	est.extend(SVM(i) for i in range(4))

	clf = VotingClassifier(estimators=est)
	clf.fit(X,y)

	# Close the file deterministically instead of leaking the handle.
	with open("clf_grande.p", "wb") as model_file:
		pickle.dump(clf, model_file)
	return
Exemplo n.º 15
0
Arquivo: util.py Projeto: pvigier/sa
def train_classifier(algorithm, features, train):
    """Fit a soft-voting classifier over the estimators named in ``algorithm``.

    ``algorithm`` is tested with ``in`` for the keys 'rf' (random forest with
    100 trees), 'lr' (logistic regression) and 'mb' (multinomial naive
    Bayes); matching estimators are added in that order.  The ensemble is
    fitted on ``features`` against ``train['sentiment']`` and returned.
    """
    print('Train classifier ({})...'.format(algorithm))

    # Factories rather than instances, so only requested models are built.
    available = (
        ('rf', lambda: RandomForestClassifier(n_estimators=100)),
        ('lr', lambda: LogisticRegression()),
        ('mb', lambda: MultinomialNB()),
    )
    selected = [(key, build()) for key, build in available if key in algorithm]

    # Training
    ensemble = VotingClassifier(estimators=selected, voting='soft')
    ensemble.fit(features, train['sentiment'])
    return ensemble
Exemplo n.º 16
0
def voting_class(X,training_target,Y):
    """Fit a soft-voting ensemble on ``X`` and predict for ``Y``.

    Only the first six columns of ``X`` and ``Y`` are used as features.

    Parameters
    ----------
    X : 2-D array supporting ``[:, 0:6]`` slicing
        Training samples.
    training_target
        Labels for the rows of ``X``.
    Y : 2-D array supporting ``[:, 0:6]`` slicing
        Samples to predict.

    Returns
    -------
    (labels, proba)
        Predicted class labels and class probabilities for ``Y``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
    eclf.fit(X[:,0:6],training_target)

    test_features = Y[:,0:6]
    proba = eclf.predict_proba(test_features)
    # predict() needs the samples; calling it without arguments raises
    # TypeError, and the results were previously never returned.
    labels = eclf.predict(test_features)
    return labels, proba
def test_predict_proba_on_toy_problem():
    """Compare soft-voting probabilities with a hand-computed weighted mean.

    Also checks that predict_proba raises AttributeError under hard voting.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    # Hard-coded reference probabilities, one array per estimator.
    clf1_res = np.array([[0.59790391, 0.40209609],
                         [0.57622162, 0.42377838],
                         [0.50728456, 0.49271544],
                         [0.40241774, 0.59758226]])

    clf2_res = np.array([[0.8, 0.2],
                         [0.8, 0.2],
                         [0.2, 0.8],
                         [0.3, 0.7]])

    clf3_res = np.array([[0.9985082, 0.0014918],
                         [0.99845843, 0.00154157],
                         [0., 1.],
                         [0., 1.]])

    # Weighted mean with weights (2, 1, 1), i.e. a total weight of 4.
    expected = (2 * clf1_res + clf2_res + clf3_res) / 4

    members = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]
    eclf = VotingClassifier(estimators=members,
                            voting='soft',
                            weights=[2, 1, 1])
    eclf_res = eclf.fit(X, y).predict_proba(X)

    # One entry per sample is checked, to one decimal place.
    for row, col in ((0, 0), (1, 1), (2, 1), (3, 1)):
        assert_almost_equal(expected[row][col], eclf_res[row][col], decimal=1)

    raised = False
    try:
        eclf = VotingClassifier(estimators=members, voting='hard')
        eclf.fit(X, y).predict_proba(X)
    except AttributeError:
        raised = True
    if not raised:
        raise AssertionError('AttributeError for voting == "hard"'
                             ' and with predict_proba not raised')
Exemplo n.º 18
0
def run_voting(training_set, train_set_labels, validation_set, validation_set_labels):
    """Fit a hard-voting KNN / logistic-regression / SVM ensemble and print its score.

    Both input sets are first passed through ``standard_data``.  The labels
    are flattened with ``.ravel()`` before use.  The validation score is
    printed; nothing is returned.
    """
    from sklearn.ensemble import VotingClassifier

    standard_train_inputs = standard_data(training_set)
    standard_valid_inputs = standard_data(validation_set)

    knn = KNeighborsClassifier(weights='uniform', n_neighbors=5)
    log_reg = sklearn.linear_model.LogisticRegression(
        penalty='l2', dual=False, tol=0.01, C=1.0, fit_intercept=True,
        intercept_scaling=1, class_weight=None, random_state=None,
        solver='newton-cg', max_iter=100, multi_class='ovr', verbose=0,
        warm_start=False, n_jobs=2)
    svc = svm.SVC(decision_function_shape='ovo', tol=0.001)

    members = [('knn', knn), ('lr', log_reg), ('svm', svc)]
    ensemble = VotingClassifier(estimators=members, voting='hard')
    ensemble.fit(standard_train_inputs, train_set_labels.ravel())

    print(ensemble.score(standard_valid_inputs, validation_set_labels.ravel()))
Exemplo n.º 19
0
def acc_VotingClassifier():
    """Cross-validate a weighted hard-voting ensemble on the module-level ``X`` / ``y``.

    A 10-fold shuffled split over 900 samples is evaluated with a
    GaussianNB / random-forest / logistic-regression ensemble (weights
    1, 3, 3).  The classifier left over from the last fold is then applied to
    the validation set matching the feature count of ``X`` (13, 1000 or 100
    columns).

    Returns
    -------
    (acc, conf_acc, valid_mfcc, valid_fft, valid_kpca)
        Mean fold accuracy, the accuracy derived from the accumulated
        confusion matrix, and the validation predictions.  Only the
        prediction matching the feature count of ``X`` is filled in; the
        other two (or all three, if no count matches) are ``None``.
    """
    kf = KFold(900, n_folds=10,shuffle=True)
    acc = 0.0
    conf_mat = [[0 for i in range(10)] for j in range(10)]
    clf1 = GaussianNB()
    clf2 = RandomForestClassifier(n_estimators=20,max_features=None,class_weight="balanced_subsample")
    clf4 = LogisticRegression()
    eclf = VotingClassifier(estimators=[('gnb', clf1), ('rf', clf2),  ('lr', clf4)], voting='hard', weights=[1,3,3])
    for temp, (train_index, test_index) in enumerate(kf, 1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        eclf = eclf.fit(X_train, y_train)
        y_predict = eclf.predict(X_test)
        acc_loop = getAccuracy(y_predict,y_test)
        conf_mat = buildConfusionMatrix(conf_mat,y_predict,y_test)
        print("*** Accuracy*** for "+str(temp)+"th time: "+str(acc_loop))
        acc += acc_loop

    # Only one branch below can run, so every result needs a default;
    # otherwise the return statement raises UnboundLocalError.
    valid_mfcc = valid_fft = valid_kpca = None
    # Checking if the data set is transformed into MFCC(13), FFT(1000) or
    # KPCA(100) features.
    if (X.shape[1]==13):
        print('In 13 features if')
        valid_mfcc = eclf.predict(validation_set_mfcc)
    elif (X.shape[1]==1000):
        print('In 1000 features elif')
        valid_fft = eclf.predict(validation_set_fft)
    elif (X.shape[1]==100):
        print('In KPCA features else')
        valid_kpca = eclf.predict(validation_set_kpca)
    acc = (acc/10.0)
    printConfusionMatrix(conf_mat)
    return acc, getAccuracyFromConfusion(conf_mat),valid_mfcc, valid_fft, valid_kpca
Exemplo n.º 20
0
def do_ml(ticker):
    """Train a voting classifier for ``ticker`` and return its test accuracy.

    The feature sets come from ``extract_featuresets(ticker)``; 25% of the
    samples are held out for testing.  The accuracy and the distribution of
    predicted labels on the held-out part are printed.
    """
    X, y, _ = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25)

    members = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    ensemble = VotingClassifier(members)
    ensemble.fit(X_train, y_train)

    confidence = ensemble.score(X_test, y_test)
    print('Accuracy', confidence)
    print('Predicted spread:', Counter(ensemble.predict(X_test)))

    return confidence
def test_tie_situation():
    """On a tie, hard voting must pick the smaller class label.

    For sample 73 the logistic regression predicts 2 and the random forest
    predicts 1, so the two-member ensemble is tied and has to answer 1.
    """
    log_reg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    ensemble = VotingClassifier(
        estimators=[("lr", log_reg), ("rf", forest)],
        voting="hard")

    tied_sample = 73
    assert_equal(log_reg.fit(X, y).predict(X)[tied_sample], 2)
    assert_equal(forest.fit(X, y).predict(X)[tied_sample], 1)
    assert_equal(ensemble.fit(X, y).predict(X)[tied_sample], 1)
Exemplo n.º 22
0
def test_tie_situation():
    """On a tie, hard voting must pick the smaller class label.

    For sample 73 the liblinear logistic regression predicts 2 and the random
    forest predicts 1, so the two-member ensemble is tied and has to answer 1.
    """
    log_reg = LogisticRegression(random_state=123, solver='liblinear')
    forest = RandomForestClassifier(random_state=123)
    members = [('lr', log_reg), ('rf', forest)]
    ensemble = VotingClassifier(estimators=members, voting='hard')

    tied_sample = 73
    assert_equal(log_reg.fit(X, y).predict(X)[tied_sample], 2)
    assert_equal(forest.fit(X, y).predict(X)[tied_sample], 1)
    assert_equal(ensemble.fit(X, y).predict(X)[tied_sample], 1)
Exemplo n.º 23
0
def test_sample_weight():
    """Exercise the sample_weight argument of VotingClassifier.fit."""
    log_reg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    svc = SVC(probability=True, random_state=123)

    # Unit weights must be equivalent to passing no weights at all.
    unit_weights = np.ones((len(y),))
    weighted = VotingClassifier(
        estimators=[('lr', log_reg), ('rf', forest), ('svc', svc)],
        voting='soft').fit(X, y, sample_weight=unit_weights)
    unweighted = VotingClassifier(
        estimators=[('lr', log_reg), ('rf', forest), ('svc', svc)],
        voting='soft').fit(X, y)
    assert_array_equal(weighted.predict(X), unweighted.predict(X))
    assert_array_almost_equal(weighted.predict_proba(X),
                              unweighted.predict_proba(X))

    # A single-member ensemble must match that member fitted on its own.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    single = VotingClassifier(estimators=[('lr', log_reg)], voting='soft')
    single.fit(X, y, sample_weight)
    log_reg.fit(X, y, sample_weight)
    assert_array_equal(single.predict(X), log_reg.predict(X))
    assert_array_almost_equal(single.predict_proba(X),
                              log_reg.predict_proba(X))

    # An estimator without sample_weight support must produce a clear
    # ValueError.
    knn = KNeighborsClassifier()
    with_knn = VotingClassifier(
        estimators=[('lr', log_reg), ('svc', svc), ('knn', knn)],
        voting='soft')
    msg = ('Underlying estimator KNeighborsClassifier does not support '
           'sample weights.')
    with pytest.raises(ValueError, match=msg):
        with_knn.fit(X, y, sample_weight)

    # A TypeError that has nothing to do with sample_weight must surface
    # unchanged.  Note that fit is called on the estimator directly here.
    class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
        def fit(self, X, y, sample_weight):
            raise TypeError('Error unrelated to sample_weight.')

    failing = ClassifierErrorFit()
    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
        failing.fit(X, y, sample_weight=sample_weight)
Exemplo n.º 24
0
class VtClassifier(Model):
    '''
    Voting classifier.

    Combines the underlying ``.model`` objects of the selected wrapper
    classes in a hard-voting ``VotingClassifier``.  Each positional argument
    must be one of the names in ``modelIndex``.
    '''

    def __init__(self, *args):
        """Build the member models and the voting ensemble.

        Raises ValueError when an argument is not listed in ``modelIndex``.
        """
        Model.__init__(self)
        self.modelIndex = ['GNB', 'SVClassifier', 'LRModel', 'ABClassifier', 'GBClassifier']
        # Wrapper classes, positionally aligned with ``modelIndex``
        # ('GNB' maps to the base ``Model`` class).
        wrappers = [Model, SVClassifier, LRModel, ABClassifier, GBClassifier]
        self.models = []
        self.estimators = []
        for arg in args:
            wrapper = wrappers[self.modelIndex.index(arg)]
            # Two separate instances, as before: one tracked in ``models``
            # and trained on its own, one supplying the estimator handed to
            # the ensemble.
            self.models.append(wrapper())
            self.estimators.append((arg, wrapper().model))
        self.model = VotingClassifier(estimators=self.estimators, voting='hard')

    def train(self, data, target):
        """Train every member wrapper, then fit the voting ensemble."""
        for model in self.models:
            model.train(data, target)
        self.model.fit(data, target)

    def predict(self, test):
        """Return the majority-vote class labels for ``test``.

        The ensemble is built with ``voting='hard'``, for which
        ``predict_proba`` is not available, so labels are returned.
        """
        return self.model.predict(test)
Exemplo n.º 25
0
def buildVoting( features, label, params, verbose=False ):
	'''Fit a soft-voting ensemble of a random forest, a KNN and an SVM.

	Arguments:
		features -- training features used to fit the model
		label    -- labels associated with ``features``
		params   -- model parameters, in order: number of trees for the
		            random forest, number of neighbours for the KNN, kernel
		            name for the SVM
		verbose  -- when true, print progress and timing messages

	The members are weighted 2 (random forest), 1 (KNN) and 2 (SVM).

	Returns the fitted ``VotingClassifier``.
	'''

	from sklearn.ensemble import RandomForestClassifier, VotingClassifier
	from sklearn.svm import SVC
	from sklearn.neighbors import KNeighborsClassifier

	forest = RandomForestClassifier( n_estimators = params[0] )
	knn = KNeighborsClassifier( n_neighbors = params[1] )
	# probability=True so the SVM can take part in soft voting.
	svc = SVC( kernel = params[2], probability = True )

	started = time()
	if verbose:
		message = (
			"Training a voting classifier from following models : \n"
			"  - Random Forest ({0} estimators) - weight = 2 \n"
			"  - {1}-Nearest Neighbors - weight = 1 \n"
			"  - SVM ('{2}' kernel) - weight = 2 \n\n"
			"Please wait...\n"
		).format( str(params[0]), str(params[1]), str(params[2]) )
		print( message )

	members = [ ('rf', forest), ('knn', knn), ('svm', svc) ]
	agg_clf = VotingClassifier( estimators=members, voting='soft', weights=[2,1,2] )
	agg_clf.fit( features, label )

	if verbose:
		print( "Completed in " + str( time()-started ) + " seconds.\n" )
	return agg_clf
Exemplo n.º 26
0
def train_assembling_average(categories, comments, badwords):
    """Fit a soft-voting ensemble of two text pipelines.

    The first pipeline feeds word 1- to 3-gram TF-IDF features into an SGD
    classifier; the second feeds the output of ``CustomTransformer(badwords)``
    into an identically configured SGD classifier.  The TF-IDF pipeline is
    weighted 3, the custom one 1.

    Returns the result of fitting the ensemble on ``comments`` /
    ``categories``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import VotingClassifier

    def make_sgd():
        # Both pipelines end in the same classifier configuration.
        return SGDClassifier(loss='log', penalty='l2', alpha=1e-3,
                             n_iter=5, random_state=42)

    tfidf = TfidfVectorizer(lowercase=True, ngram_range=(1, 3),
                            analyzer="word", min_df=3)
    text_pipeline = Pipeline([('vect', tfidf), ('clf', make_sgd())])

    custom = CustomTransformer(badwords)
    custom_pipeline = Pipeline([('vect', custom), ('clf', make_sgd())])

    ensemble = VotingClassifier(
        estimators=[('text', text_pipeline), ('custom', custom_pipeline)],
        voting='soft',
        weights=[3, 1])
    return ensemble.fit(comments, categories)
Exemplo n.º 27
0
def combine_voting_NB_classifier(X_train, X_test, y_train, y_test,X_train_meta, X_test_meta, y_train_meta, y_test_meta):
    """Hard-vote a Bernoulli naive Bayes and a nearest-centroid classifier.

    Only the ``*_meta`` arguments are used.  An SVC is also fitted on the
    meta training data but is not part of the ensemble.  The predictions for
    ``X_test_meta`` are saved to 'oto_wyniki.csv' and a classification report
    against ``y_test_meta`` is printed.
    """
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import NearestCentroid
    from sklearn.ensemble import VotingClassifier
    from sklearn.svm import SVC

    bernoulli = BernoulliNB(alpha=0.1).fit(X_train_meta, y_train_meta)
    # Fitted but not included in the vote below.
    svc = SVC(C=100, gamma=0.1).fit(X_train_meta, y_train_meta)
    centroid = NearestCentroid().fit(X_train_meta, y_train_meta)

    members = [('nb1', bernoulli), ('nb2', centroid)]
    ensemble = VotingClassifier(estimators=members, voting='hard')
    ensemble = ensemble.fit(X_train_meta, y_train_meta)
    y_voting_predicted = ensemble.predict(X_test_meta)

    np.savetxt('oto_wyniki.csv', y_voting_predicted, delimiter=',')
    print("\n Here is the classification report for Voting classifier:")
    print(metrics.classification_report(y_test_meta, y_voting_predicted))
Exemplo n.º 28
0
def all_classifer(X_train,y_train,X_test,y_test):
    """Score four tree ensembles and a soft-voting combination of them.

    Each classifier is fitted once on the training data and evaluated with
    the module-level ``scores`` helper, which receives the true labels, the
    predicted labels, the predicted probability of the second class and a
    display name.  The first element of each individual score is then used as
    that classifier's weight in the voting ensemble.

    Returns
    -------
    list
        ``[score1, score2, score3, score5, score7]`` for random forest,
        gradient boosting, extra trees, AdaBoost and the voting ensemble.
    """
    def fit_and_score(clf, label):
        # Fit first, then query labels and probabilities from the fitted model.
        predicted = clf.fit(X_train, y_train).predict(X_test)
        positive_proba = clf.predict_proba(X_test)[:, 1]
        return scores(y_test, predicted, positive_proba, label)

    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    score1 = fit_and_score(rf, 'RT')

    # Fitted once inside fit_and_score (it used to be fitted twice in a row).
    gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05)
    score2 = fit_and_score(gbc, 'gbc')

    # min_samples_split must be at least 2 in current scikit-learn; a node
    # holding a single sample cannot be split anyway.
    ets = ExtraTreesClassifier(n_estimators=100, max_depth=None,
                               min_samples_split=2, random_state=0)
    score3 = fit_and_score(ets, 'ets')

    ab = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=50,
                            learning_rate=0.7)
    score5 = fit_and_score(ab, 'abboost')

    eclf = VotingClassifier(estimators=[('rf', rf), ('gd', gbc),
                                        ('ETs', ets), ('ab', ab)],
                            voting='soft',
                            weights=[score1[0], score2[0], score3[0], score5[0]])
    score7 = fit_and_score(eclf, 'voting')
    print(eclf)
    return [score1,score2,score3,score5,score7]
Exemplo n.º 29
0
def mutipleClf(label_clfset,data,features,votingType='soft',weight=None,testData=None,testFeatures=None):
    """Fit a VotingClassifier over ``label_clfset`` and optionally score it.

    Args:
        label_clfset: list of ``(name, estimator)`` pairs for the ensemble.
        data: training samples.
        features: training targets (passed as ``y`` to ``fit`` and to
            ``cross_val_score``, despite the name).
        votingType: ``'soft'`` or any other value accepted by
            ``VotingClassifier(voting=...)``.
        weight: per-estimator weights for soft voting.  When ``None`` or
            empty, each estimator's mean 10-fold cross-validation score is
            used instead.  The caller's list is never modified.
        testData: optional samples to predict after fitting.
        testFeatures: expected targets for ``testData``.

    Returns:
        tuple: ``(fitted_classifier, accuracy)``; ``accuracy`` is ``0.0``
        when no ``testData`` is given.
    """
    print("======================================\n")
    print("Start at: "+time.strftime("%H:%M:%S")+"\n")
    if votingType=='soft':
        if weight is None or len(weight) == 0:
            # Derive the weights from ten-fold cross-validation scores.  A
            # fresh list is built on every call so nothing is shared between
            # calls (the old mutable default accumulated weights).
            weights = [
                cross_validation.cross_val_score(
                    label_clf[1], data, features, cv=10).mean()
                for label_clf in label_clfset
            ]
        else:
            # Caller-supplied weights: no cross-validation is needed.
            weights = list(weight)
        eclf = VotingClassifier(estimators=label_clfset, voting=votingType, weights=weights)
    else:
        eclf = VotingClassifier(estimators=label_clfset, voting=votingType)
    result=eclf.fit(data,features)
    accuracy=0.0
    # Identity check: "!= None" on an array compares element-wise.
    if testData is not None:
        testResult=eclf.predict(testData)
        accuracy=getAccuracy(testResult,testFeatures)
    print("End at: "+time.strftime("%H:%M:%S")+"\n")
    print("======================================\n")
    return result,accuracy
Exemplo n.º 30
0
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Evaluate clf's accuracy on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

# ---------------------------------------------------------
# Ensemble step: combine the previously defined classifiers by voting.
# Import VotingClassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

# Instantiate a VotingClassifier vc from the `classifiers` collection defined
# earlier in the script (presumably (name, estimator) pairs -- TODO confirm).
# No `voting` argument is given, so the library default is used.
vc = VotingClassifier(estimators=classifiers)

# Fit vc to the training set
vc.fit(X_train, y_train)

# Predict labels for the test set
y_pred = vc.predict(X_test)

# Calculate the accuracy score on the test set and report it
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))

# ---------------------------------------------------------
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
Exemplo n.º 31
0
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=44)

# Group / ensemble of models: (name, estimator) pairs shared by both voters
estimator = []
estimator.append(('LR',
                  LogisticRegression(solver='lbfgs',
                                     multi_class='multinomial',
                                     max_iter=200)))
# probability=True so the SVC can take part in soft voting below
estimator.append(('SVC', SVC(gamma='auto', probability=True)))
estimator.append(('DTC', DecisionTreeClassifier()))

# Voting Classifier with hard voting
vot_hard = VotingClassifier(estimators=estimator, voting='hard')
vot_hard.fit(X_train, y_train)
y_pred = vot_hard.predict(X_test)

# Accuracy of the hard-voting ensemble on the test set.
# accuracy_score returns a fraction in [0, 1]; the former "% d" conversion
# truncated it to an integer (almost always 0), so format it as a float.
score = accuracy_score(y_test, y_pred)
print("DONALD TRUP %.3f" % score)

# Voting Classifier with soft voting
vot_soft = VotingClassifier(estimators=estimator, voting='soft')
vot_soft.fit(X_train, y_train)
y_pred = vot_soft.predict(X_test)

# Accuracy of the soft-voting ensemble on the test set
score = accuracy_score(y_test, y_pred)
print("JOE BIDEN %.3f" % score)
Exemplo n.º 32
0
# Plot the SVM ROC curves: regular test set (orange) and the "adv." variant (green)
plt.plot(fpr_sv, tpr_sv, color='darkorange', lw=lw, label="ROC Curve (area = %0.2f)" % roc_auc_sv)
plt.plot(fpr_sv_adv, tpr_sv_adv, color='green', lw=lw, label="ROC Curve adv. (area = %0.2f)" % roc_auc_sv_adv)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC SVM (class=Normal)")
plt.legend(loc="lower right")
# Persist the figure to disk
plt.savefig('ROC_SVM.png')
print()

print()
print("=============================== Voting CLassifier ==============================")
# Hard-voting ensemble of the previously built dt, rf and sv estimators
vot = VotingClassifier(estimators=[('dt', dt), ('rf', rf), ('sv', sv)], voting='hard')
vot.fit(X_train_scaled, y_train_l)
y_pred = vot.predict(X_test_scaled)

# Calculate FPR for normal class only
# NOTE(review): y_pred holds hard class labels rather than scores, so the
# resulting ROC curve is very coarse -- confirm this is intended.
fpr_vot, tpr_vot, _ = roc_curve(y_test_l, y_pred, pos_label=1, drop_intermediate=False)

roc_auc_vot = auc(fpr_vot, tpr_vot)
print("Accuracy score: {}".format(accuracy_score(y_test_l, y_pred)))
print("F1 Score: {}".format(f1_score(y_test_l, y_pred, average='micro')))
print("AUC score: {}".format(roc_auc_vot))

# Predict using adversarial test samples; they are scored against the same
# y_test_l labels as the regular test set.
y_pred_adv = vot.predict(X_adv)
fpr_vot_adv, tpr_vot_adv, _ = roc_curve(y_test_l, y_pred_adv, pos_label=1, drop_intermediate=False)
roc_auc_vot_adv = auc(fpr_vot_adv, tpr_vot_adv)
print("Accuracy score adversarial: {}".format(accuracy_score(y_test_l, y_pred_adv)))
Exemplo n.º 33
0
clf2 = xgb.XGBClassifier()  # Xgboost
# Soft voting combines the two models through their predicted probabilities
eclf = VotingClassifier(
    estimators=[('lr', clf1), ('xgb', clf2)],
    voting='soft')  # ensemble of Logistic Regression and Xgboost

# Cross Validation
# for clf, label in zip([clf1, clf2, eclf], ['Logistic Regression', 'Xgboost', 'Ensemble']):
#     scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='log_loss')
#     print("Log loss: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# Read test data: collect one id per line of test.csv, skipping the header row
test_ids = list()
with open('test.csv', 'r') as f:
    next(f)
    for line in f:
        # NOTE(review): drops the last two characters of each line; presumably
        # a trailing separator plus the newline -- verify against the file.
        test_ids.append(line[:-2])

# Fit the ensemble and get one probability column per class for the test set
eclf.fit(X_train, y_train)
y_pred = eclf.predict_proba(X_test)

# Write predictions to a file
with open('sample_submission.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # Header row: "Article" followed by the class labels, in classes_ order
    lst = eclf.classes_.tolist()
    lst.insert(0, "Article")
    writer.writerow(lst)
    # One row per test id: the id followed by its class probabilities
    for i, test_id in enumerate(test_ids):
        lst = y_pred[i, :].tolist()
        lst.insert(0, test_id)
        writer.writerow(lst)
        print("\t\t* {0}: {1}".format(par, dict_clf[clf]['best_par'][par]))

# ## 3.c. Test set predictions
# Our three classifiers have equivalent accuracy over the evaluation set. We can then let them vote using `VotingClassifier`.

# In[47]:

from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs: the 'best_clf' entry stored for each model in dict_clf
estimators = [('RF', dict_clf['RF']['best_clf']),
              ('GB', dict_clf['GB']['best_clf']),
              ('ADB', dict_clf['ADB']['best_clf'])]

# Instantiate the VotingClassifier using soft voting; n_jobs=-1 asks for all
# available processors
voter = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
voter.fit(X_train, y_train)

# Predicted labels for the test set, cast to int
pred = voter.predict(X_test).astype(int)

# In[48]:

# Share of passengers in the training set whose Survived value equals 1
known = train.Survived.values
nb_survived = sum(1 for outcome in known if outcome == 1)
print("Number of survivors in training set: {0} over {1} "
      "({2:.2%})".format(nb_survived, len(known), nb_survived / len(known)))

# Calculate the predicted survival rate in the test set
# 10-fold cross-validated accuracy of the non-scaled voting classifier on the
# training data
VT_NonScaled_cross_val_scores = cross_val_score(VT_classifier_nonscaled,
                                                X_train,
                                                y_train,
                                                cv=10,
                                                scoring='accuracy')
# Report the mean score together with twice its standard deviation
print(
    "The 10 fold cross validation score based on Voting Classifier(Non-Scaled) is: %0.3f(+/-%0.3f)"
    % (VT_NonScaled_cross_val_scores.mean(),
       VT_NonScaled_cross_val_scores.std() * 2))

# In[31]:

# A mean CV accuracy above 0.97 is treated as overfitting and the model is not
# used; otherwise it is fitted on the full training set and evaluated.
if VT_NonScaled_cross_val_scores.mean() > 0.97:
    print("The Voting Classifier (Non Scaled) is overfitting in this case.")
else:
    VT_classifier_nonscaled.fit(X_train, y_train)
    VT_NonScaled_predicted = VT_classifier_nonscaled.predict(X_test)
    # Sum of the predictions divided by their count; presumably the labels are
    # 0/1 so this is the predicted default rate -- TODO confirm.
    VT_NonScaled_prob_default = np.sum(VT_NonScaled_predicted) / len(
        VT_NonScaled_predicted)
    print(
        "The Default Probability based on Voting Classifier(Non Scaled) is :",
        '%.3f' % VT_NonScaled_prob_default)
    # Score of the fitted classifier on the held-out test set
    VT_NonScaled_accuracy = VT_classifier_nonscaled.score(X_test, y_test)
    print("The accuracy of Voting Classifier(Non Scaled) on test set is : ",
          '%.3f' % VT_NonScaled_accuracy)

# In[32]:

#output the result into the existing evaluation dataframe to compare with other models
new_evaluation = pd.DataFrame({
    'Model': ["Voting Classifier_NonScaled"],
# stack base predictions for training the meta model
#stacked_predictions = np.column_stack((rf_fit.predict(x_train),et_fit.predict(x_train),ada_fit.predict(x_train),gb_fit.predict(x_train),svc_fit.predict(x_train)))

# (A stray bare identifier stood here; evaluating it raised NameError, so it
# was removed.)

# train meta model
from sklearn.linear_model import LinearRegression
#meta_model = LinearRegression()
#meta_model.fit(stacked_predictions, t_train)
from sklearn import preprocessing
# Load the data to predict on and rescale each column with min-max scaling
satsuki = pd.read_csv('haruten.csv', index_col=0)
mm = preprocessing.MinMaxScaler()  # create the scaler instance
satsuki_seiki = mm.fit_transform(satsuki)
arima = pd.read_csv('arima.csv', index_col=0)
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs combined by the voting ensemble
estimators = [
    ('svc', SVC()),
    ('rf', RandomForestClassifier()),
    ('et', ExtraTreesClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier()),
]

# NOTE(review): `sum` shadows the builtin; the name is kept because code
# outside this excerpt may rely on it.
sum = 0
buy = 0
# Fit the ensemble on (x, t) and print its predictions for the scaled data
voting = VotingClassifier(estimators)
voting.fit(x, t)
print(voting.predict(satsuki_seiki))
Exemplo n.º 37
0
    # hard voting

    # moon data set
    X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # build models
    log_clf = LogisticRegression(random_state=42)
    rnd_clf = RandomForestClassifier(random_state=42)
    svm_clf = SVC(random_state=42)

    voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf),
                                              ('svc', svm_clf)],
                                  voting='hard')  # hard voting
    print(voting_clf.fit(X_train, y_train))
    '''
        VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                                                                intercept_scaling=1, max_iter=100, multi_class='ovr', 
                                                                n_jobs=1,penalty='l2', random_state=42, 
                                                                solver='liblinear', tol=0.0001,verbose=0, 
                                                                warm_start=False)), 
                                     ('rf', RandomFor...f', max_iter=-1, probability=False, random_state=42, 
                                        shrinking=True,tol=0.001, verbose=False))],
                                    flatten_transform=None, n_jobs=1, voting='hard', weights=None)
         '''  # RandomForests classifier details info needed later

    # show each classifier's accuracy score
    for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
Exemplo n.º 38
0
        clf = RandomizedSearchCV(model,
                                 param_distributions=param,
                                 cv=5,
                                 verbose=0,
                                 n_jobs=-1,
                                 n_iter=200)
        print("Training model: {}".format(model.__class__.__name__))
        lst_best_models.append((model_name, clf.fit(X_train_fit, y_train)))
    else:
        lst_best_models.append((model_name, model.fit(X_train_fit, y_train)))

# Ensemble of models: soft voting over the tuned models collected above
from sklearn.ensemble import VotingClassifier
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import cross_val_score
eclf = VotingClassifier(estimators=lst_best_models, voting='soft')
eclf.fit(X_train_fit, y_train)
# 5-fold cross-validated accuracy of the ensemble on the training data
scores = cross_val_score(eclf, X_train_fit, y_train, cv=5, scoring='accuracy')
print(scores)
'''
clf = RandomizedSearchCV(model, param_distributions=params, cv=5, verbose=1, n_jobs=-1, n_iter=100)

logging.info("Training...")
best_model = clf.fit(X_train_fit, y_train)

# Print best results (rank by high test score and low std)
train_result = pd.DataFrame(clf.cv_results_)
train_result.sort_values(by=['mean_test_score', 'std_test_score'], ascending=[False, True], inplace=True)
print(train_result[['mean_test_score', 'std_test_score']].head())

for param in params:
    print('Parameter: {}, best value={}'.format(param, best_model.best_estimator_.get_params()[param]))
Exemplo n.º 39
0
# Row-wise median of the summary table, then display it sorted by that column
summary['Median'] = summary.median(1)
summary.sort_values('Median', ascending=False)

# In[ ]:

# Weighted hard-voting ensemble; each weight lines up with the estimator in
# the same position of the list.
clf_vote = VotingClassifier(estimators=[
    ('knn', clf_knn),
    ('svm', clf_svm),
    ('extra', clf_ext),
    ('xgb', clf_xgb),
    ('percep', clf_pctr),
    ('logistics', clf_log),
],
                            weights=[2, 2, 3, 3, 1, 2],
                            voting='hard')
clf_vote.fit(X, y)

# 5-fold cross-validated accuracy of the ensemble
scores = cross_val_score(clf_vote, X, y, cv=5, scoring='accuracy')
print('Voting: Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()))

# In[ ]:

train = X

ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
# random_state only has an effect when shuffle=True, and recent scikit-learn
# raises ValueError if it is set without shuffling.  Dropping it keeps the
# same unshuffled, consecutive folds the original produced.
kf = KFold(n_splits=NFOLDS)

Exemplo n.º 40
0
# Each classifier below is fitted on the training split and its accuracy on
# the validation split is printed.
gnb.fit(train_x, train_y)
gnb_pred_y = gnb.predict(val_x)
print('GaussianNB Accuracy:', metrics.accuracy_score(val_y, gnb_pred_y))

bern = BernoulliNB()
bern.fit(train_x, train_y)
bern_pred_y = bern.predict(val_x)
print('BernoulliNB Accuracy:', metrics.accuracy_score(val_y, bern_pred_y))

multi = MultinomialNB()
multi.fit(train_x, train_y)
multi_pred_y = multi.predict(val_x)
print('MultinomialNB Accuracy:', metrics.accuracy_score(val_y, multi_pred_y))

log = LogisticRegression()
log.fit(train_x, train_y)
log_pred_y = log.predict(val_x)
print('Logistic Regression Accuracy:',
      metrics.accuracy_score(val_y, log_pred_y))

sgd = SGDClassifier()
sgd.fit(train_x, train_y)
sgd_pred_y = sgd.predict(val_x)
print('SGDClassifier Accuracy:', metrics.accuracy_score(val_y, sgd_pred_y))

# Voting ensemble over all five classifiers above; no `voting` argument is
# given, so the library default is used.
vote = VotingClassifier([('gnb', gnb), ('bern', bern), ('multi', multi),
                         ('lr', log), ('sgd', sgd)])
vote.fit(train_x, train_y)
vote_pred_y = vote.predict(val_x)
print('Vote Accuracy:', metrics.accuracy_score(val_y, vote_pred_y))
Exemplo n.º 41
0
            min_samples_split=2, n_estimators='warn', random_state=42,
            verbose=0, warm_start=False)
# Ensemble: Bagging
bagging = BaggingClassifier(rf, n_estimators=500, max_samples=1.0, random_state=42)
# Ensemble: Weighted Voting - not included in the report
eclf3 = VotingClassifier(estimators=[
       ('CART', c), ('ANN', ann), ('BAG', bagging)],
       voting='soft', weights=[14,3,3],
       flatten_transform=True)

# Training models
H = model.fit(x_train, y_train2, validation_data=(x_test, y_test2),
              epochs=120, batch_size=100)
c.fit(x_train, y_train)
ann.fit(x_train, y_train)
eclf3 = eclf3.fit(x_train, y_train)
bagging.fit(x_train, y_train)
rf.fit(x_train, y_train)

# Plot CART rules
feature_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
target_names = ['0', '1', '2', '3', '4']
# With out_file set, export_graphviz writes the DOT file and returns None, so
# the graph is built from the written file rather than from the return value
# (the original handed None to graphviz.Source).
tree.export_graphviz(c, out_file='tree.dot',
                     feature_names=feature_names,
                     class_names=target_names,
                     filled=True, rounded=True,
                     special_characters=True)
graph = Source.from_file('tree.dot')
# Keep the DOT text available under the original name.
dot_data = graph.source
Exemplo n.º 42
0
## Boosting ##
skf = StratifiedKFold(target, n_folds = 5)
for train, test in skf:
    clf = BaggingClassifier(base_estimator = RandomForestClassifier(class_weight = "balanced_subsample"), n_estimators = 250, bootstrap = True, bootstrap_features = True, n_jobs = -1)
    clf.fit(matrix[train], target[train])
    yPred = clf.predict_proba(matrix[test])[:,1]
    print roc_auc_score(target[test], yPred), "BaggingRandomForest"

## Voting ##
skf = StratifiedKFold(target, n_folds = 5)
for train, test in skf:
    clf1 = RandomForestClassifier(class_weight = "balanced_subsample", n_jobs = -1)
    clf2 = svm.SVC(kernel = "linear", class_weight = "balanced", probability = True, C = 10)
    vclf = VotingClassifier(estimators = [('rf',clf1),('svc', clf2)], voting = "soft")
    vclf.fit(matrix[train], target[train])
    yPred = vclf.predict(matrix[test])
    print roc_auc_score(target[test], yPred)

## Extra Trees ##
skf = StratifiedKFold(target, n_folds = 5)
for train, test in skf:
    clf = ExtraTreesClassifier(n_estimators = 100, class_weight = "balanced_subsample", n_jobs = -1, bootstrap = True)
    clf.fit(matrix[train], target[train])
    yPred = clf.predict_proba(matrix[test])[:,1]
    print roc_auc_score(target[test], yPred), "ERTrees"

## Gradient Boost ##
skf = StratifiedKFold(target, n_folds = 5)
for train, test in skf:
    clf = GradientBoostingClassifier(n_estimators = 250, max_features = "auto", init = RandomForestClassifier(class_weight = "balanced_subsample", n_jobs = -1))
Exemplo n.º 43
0
# Summary table of the individual model scores, displayed best-first
models = pd.DataFrame({'Model': ["Support Vector Machine", "KNN", "Logistic Regression",
                                 "Decision Tree", "Perceptron"],
                       'Score': [score_svm, score_knn, score_lgr, score_dtree, score_pctr]})
models.sort_values(by='Score', axis=0, ascending=False)

# Ensemble methods
# Weighted hard voting; each weight lines up with the estimator in the same
# position of the list.
classifier_vote = VotingClassifier(estimators=[
    ('knn', classifier_knn),
    ('svm', classifier_svm),
    ("logistic", classifier_lgr),
    ("decisiontree", classifier_dtree),
    ("perceptron", classifier_pctr)],
    weights=[2, 3, 2, 3, 1],
    voting='hard'
)
classifier_vote.fit(X, y)
# 5-fold cross-validated accuracy of the ensemble
score_votes = cross_val_score(classifier_vote, X, y, cv=5, scoring='accuracy')
print("Voting: Accuracy: %0.2f (+/- %0.2f)" % (score_votes.mean(), score_votes.std()))

train = X
test = test_set
n_train = train.shape[0]
n_test = test.shape[0]
SEED = 0
NFOLDS = 5
# random_state only has an effect when shuffle=True, and recent scikit-learn
# raises ValueError if it is set without shuffling.  Dropping it keeps the
# same unshuffled, consecutive folds the original produced.
kf = KFold(n_splits=NFOLDS)


def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((n_train,))
    oof_test = np.zeros((n_test,))
Exemplo n.º 44
0
x_train, x_valid, x_test, y_train, y_valid, y_test = generate.get_data()
# Flatten every sample into a single 1-D feature vector
x_train = [x.reshape(1, -1)[0] for x in x_train]
x_valid = [x.reshape(1, -1)[0] for x in x_valid]
x_test = [x.reshape(1, -1)[0] for x in x_test]
# Fit the PCA projection on the training data only, then apply it to the test data
pca = PCA(svd_solver='randomized', n_components=n_component)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

# Load the previously saved base models.  Context managers close the files
# promptly instead of leaking the handles.
# NOTE: pickle can execute arbitrary code; only load files you trust.
with open('model_NB.pkl', 'rb') as nb_file:
    NB = pickle.load(nb_file)
KNN = KNeighborsClassifier(n_neighbors=3,
                           weights='distance').fit(x_train, y_train)
with open('model_Dtree.pkl', 'rb') as dtree_file:
    Dtree = pickle.load(dtree_file)
# Weighted hard-voting ensemble over the three base models
model = VotingClassifier(estimators=[('NB', NB), ('KNN', KNN),
                                     ('Dtree', Dtree)],
                         voting='hard',
                         weights=[0.32, 0.31, 0.37])
model.fit(x_train, y_train)
y_pred = np.array(model.predict(x_test))

print(y_test)
print(y_pred)

# Save the fitted ensemble with pickle's default protocol; the `with` block
# guarantees the file is flushed and closed.
with open('model_Vote.pkl', 'wb') as vote_file:
    pickle.dump(model, vote_file)

# Evaluation on the test set
print('Confusion matrix: ', confusion_matrix(y_test, y_pred))
print('Accuracy score: ', accuracy_score(y_test, y_pred))
print('Precision score: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall score: ', recall_score(y_test, y_pred, average='weighted'))
print('F1 score: ', f1_score(y_test, y_pred, average='weighted'))
# Creating the VotingClassifier using soft voting as the sub classifiers are
# well trained to the data due to gridsearchcv.
# Ensemble for the "drop" data set; n_jobs=-1 asks for all available processors.
softVoteC_drop = VotingClassifier(estimators=[
    ('rfc', drop_rf), ('dt', drop_dt), ('ada', drop_ada), ('bag', drop_bag),
    ('grad', drop_grad), ('xgb', drop_xgb), ('et', drop_et)
],
                                  voting='soft',
                                  n_jobs=-1)

# Repeat for the other data set ("per"), with the same seven model families
softVoteC_per = VotingClassifier(estimators=[('rfc', per_rf), ('dt', per_dt),
                                             ('ada', per_ada),
                                             ('bag', per_bag),
                                             ('grad', per_grad),
                                             ('xgb', per_xgb), ('et', per_et)],
                                 voting='soft',
                                 n_jobs=-1)

# Fitting the VoteClassifiers, each on its own feature/label training split
softVoteC_drop = softVoteC_drop.fit(drop_f_train, drop_l_train)
softVoteC_per = softVoteC_per.fit(per_f_train, per_l_train)

# Dumping the voteClassifiers to pickle files for saving and downloading
with open('Soft_voteC_drop.pkl', 'wb') as pf:
    pickle.dump(softVoteC_drop, pf)

with open('Soft_voteC_per.pkl', 'wb') as pf:
    pickle.dump(softVoteC_per, pf)

print('Done!!!!')
                                                shuffle=True)

# NOTE: despite its name, `mlp` is a linear-kernel SVC; probability=True lets
# it take part in the soft-voting ensemble below.
mlp = SVC(C=1.0, kernel='linear', probability=True, gamma=0.1, tol=0.001)

mlp.fit(xTrain, yTrain)
score = str(mlp.score(xTest, yTest))
print('SVC Score ' + score)

clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(xTrain, yTrain)
score = str(clf.score(xTest, yTest))
print('AdaBoostClassifier Score ' + score)

rclf = DecisionTreeClassifier(max_depth=20, random_state=0)
rclf.fit(xTrain, yTrain)
score = str(rclf.score(xTest, yTest))
print('DecisionTreeClassifier Score ' + score)

# Soft-voting ensemble over the three classifiers above
eclf2 = VotingClassifier(estimators=[('svc', mlp), ('adaboost', clf),
                                     ('rf', rclf)],
                         voting='soft')
eclf2.fit(xTrain, yTrain)
score = str(eclf2.score(xTest, yTest))
print('VotingClassifier Score ' + score)

import pickle

# Save the model to disk; the context manager makes sure the file is flushed
# and closed (the original left the handle open).
filename = 'eclf2.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(eclf2, model_file)
Exemplo n.º 47
0
# Put the test-set predictions of the five tuned models side by side and plot
# how strongly they correlate with each other.
ensemble_results = pd.concat([
    test_Survived_RFC, test_Survived_ExtC, test_Survived_AdaC,
    test_Survived_GBC, test_Survived_SVMC
],
                             axis=1)

g = sns.heatmap(ensemble_results.corr(), annot=True)

#########################################################################################
# Soft-voting ensemble over the five tuned models
votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
                                       ('svc', SVMC_best), ('adac', ada_best),
                                       ('gbc', GBC_best)],
                           voting='soft',
                           n_jobs=1)

votingC = votingC.fit(all_X, all_y)
#########################################################################################

test_Survived = pd.Series(votingC.predict(test[predictors]), name="Survived")

submit("ensemble_python_voting.csv", votingC)

################################## Logistic Regression ##################################
logreg = LogisticRegression(random_state=0)
logreg.fit(all_X, all_y)

# Recursive feature elimination down to 3 features.  n_features_to_select is
# passed by keyword: current scikit-learn no longer accepts it positionally.
rfe = RFE(logreg, n_features_to_select=3)
rfe = rfe.fit(all_X, all_y)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
Exemplo n.º 48
0
svm_predictions = svm_model_linear.predict(X_test)

# model accuracy for X_test
accuracy = svm_model_linear.score(X_test, y_test)

# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

clf = RandomForestClassifier(n_estimators=5, max_depth=5, random_state=0)
clf.fit(X_train, y_train)

#print(clf.predict([['2018', '04', '01']]))
print(clf.score(X_test, y_test, sample_weight=None))

clf2 = GradientBoostingClassifier(n_estimators=10,
                                  learning_rate=1.0,
                                  max_depth=5,
                                  random_state=0).fit(X_train, y_train)
print(clf2.score(X_test, y_test))
#print(clf2.predict([['2018', '04', '01']]))

# Hard-voting ensemble over the linear SVM, random forest and gradient boosting
eclf1 = VotingClassifier(estimators=[('lsvm', svm_model_linear), ('rf', clf),
                                     ('gbc', clf2)],
                         voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
print(eclf1.score(X_test, y_test, sample_weight=None))
#print(eclf1.predict([['2018', '04', '01']]))

# Persist the fitted ensemble; the context manager makes sure the file is
# flushed and closed (the original left the handle open).
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(eclf1, model_file)
Exemplo n.º 49
0
# Number of leading rows of `data` used for training
train_size = 10000

# TF-IDF features over 1- to 3-grams, limited to 30000 features, with English
# stop words removed
tf = TfidfVectorizer(max_features=30000,
                     ngram_range=(1, 3),
                     stop_words='english')
tf.fit(data.text)
transformed = tf.transform(data.text)

# Dense feature matrix and labels for the first train_size rows.
# NOTE(review): .toarray() materialises a dense train_size x n_features
# matrix, which can be large -- confirm memory is not a concern.
x_data = transformed[:train_size].toarray()
y_data = data.polarity[:train_size].values

# Voting ensemble of three linear/naive-Bayes text classifiers; no `voting`
# argument is given, so the library default is used.
voting = VotingClassifier([('LR', LogisticRegression()),
                           ('NB', MultinomialNB()),
                           ('Ridge', RidgeClassifier())])

voting.fit(x_data, y_data)


# Define the streaming classifier
class StreamClassifier(StreamListener):
    def __init__(self, classifier, vectorizer, api=None):
        super().__init__(api)
        self.clf = classifier
        self.vec = vectorizer

    # What to do when a tweet arrives
    def on_data(self, data):
        # Create a json object
        json_format = json.loads(data)
        # Get the tweet's text
        text = json_format['text']
Exemplo n.º 50
0
# Individual models to compare
svc = SVC(kernel="linear")
lr = LinearRegression(normalize=True)
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
rfc = RandomForestClassifier(n_estimators=10)
lor = LogisticRegression(random_state=1)
gnb = GaussianNB()
# Hard-voting ensemble; svc and lr are not part of it
vot = VotingClassifier(estimators=[('lr', lor), ('rf', rfc), ('gnb', gnb),
                                   ('knn', knn)],
                       voting='hard')
# Fit every model on the training split
lr.fit(x_train, y_train)
svc.fit(x_train, y_train)
knn.fit(x_train, y_train)
rfc.fit(x_train, y_train)
lor.fit(x_train, y_train)
gnb.fit(x_train, y_train)
vot.fit(x_train, y_train)
# Report each model's score on the test split.
# NOTE(review): LinearRegression is a regressor, so its score() is not an
# accuracy and is not directly comparable with the classifier scores.
print("LogisticRegression", lor.score(x_test, y_test))
print("GaussianNB", gnb.score(x_test, y_test))
print("RandomForestClassifier ", rfc.score(x_test, y_test))
print("KNeighborsClassifier ", knn.score(x_test, y_test))
print("SVC ", svc.score(x_test, y_test))
print("LinearRegression ", lr.score(x_test, y_test))
print('VotingClassifier', vot.score(x_test, y_test))
N = 7
x = range(N)
y = [
    lor.score(x_test, y_test),
    gnb.score(x_test, y_test),
    rfc.score(x_test, y_test),
    knn.score(x_test, y_test),
    svc.score(x_test, y_test),
Exemplo n.º 51
0
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

# Candidate classifiers as (name, estimator) pairs, all with default settings
Classification_models=[('LogisticRegression',LogisticRegression()),('StochasticGDC',SGDClassifier()),('KNC',KNeighborsClassifier()),('SVC',SVC()),
                       ('LinearSVC',LinearSVC()),('GNaiveBayes',GaussianNB()),('MNaiveBayes',MultinomialNB()),('DTree',DecisionTreeClassifier()),
                       ('MLPerceptronC',MLPClassifier()),('RF',RandomForestClassifier()),('ET',ExtraTreesClassifier()),('AdaBoostC',AdaBoostClassifier()),
                       ('GBC',GradientBoostingClassifier()),('XGBC',XGBClassifier())]
# Mean 5-fold cross-validated accuracy per model, kept in parallel lists
result=[]
names=[]
for name,model in Classification_models:
    cvresult=cross_val_score(model,X,y,cv=5,n_jobs=-1,scoring = 'accuracy')
    result.append(cvresult.mean())
    names.append(name)
    print("%s gives %f " % (name, cvresult.mean()))
# Grid search over SVC hyper-parameters with 5-fold cross-validation
params={'C':[0.01,0.1,1],'gamma':[1,0.1,0.01],'kernel':['linear', 'poly', 'rbf'] }
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(SVC(),param_grid=params,n_jobs=-1,cv=5)
gridfit=grid.fit(X,y)
# Bare expressions: these only display a value in an interactive session
gridfit.best_score_
gridfit.best_params_


# Voting ensemble of an SVC with fixed hyper-parameters and Gaussian naive
# Bayes; no `voting` argument is given, so the library default is used.
vc=VotingClassifier(estimators=[('Support Vector Classifier',SVC(C=1, gamma=1, kernel='linear')),
                                ('Gaussian Naive Bayes',GaussianNB())])

vc.fit(X,y)
predictions = vc.predict(test1)
# Build the submission: ids from `test`, predictions mapped back to their
# original labels through `label`, written without the index column
submission = pd.DataFrame({'id':test['id'], 'type':predictions})
submission['type']=label.inverse_transform(submission['type'])
submission.to_csv('submission.csv', index=False)
# Fit NN on the training data, then report accuracy and confusion matrix on
# the test data
NN.fit(trainData_x, trainData_y)
predict = NN.predict(testData_x)
score = NN.score(testData_x, testData_y)
S = 'Overall Accuracy: ' + repr(score * 100) + ' %' + '\n'
print(S)
confusionMatrix = confusion_matrix(testData_y, predict)
print('Confusion Matrix: ')
print(confusionMatrix)
print('\n')

#Unweighted Voting Classifier
# No weights are passed, so all five estimators count equally; voting='soft'
# combines predicted probabilities rather than counting label votes.
print("********TASK 2.2: VOTING CLASSIFIER: UNWEIGHTED******** \n")
VCU = VotingClassifier(estimators=[('gnb', NB), ('lr', LR), ('dt', DT),
                                   ('knn', KNN), ('mlp', NN)],
                       voting='soft')
VCU.fit(trainData_x, trainData_y)
predict = VCU.predict(testData_x)
score = VCU.score(testData_x, testData_y)
S = 'Overall Accuracy: ' + repr(score * 100) + ' %' + '\n'
print(S)
confusionMatrix = confusion_matrix(testData_y, predict)
print('Confusion Matrix: ')
print(confusionMatrix)
print('\n')

#Weighted Majority Voting Classifier
print("********TASK 2.3: VOTING CLASSIFIER: WEIGHTED******** \n")
# Five candidate weights (one per estimator) and the best score seen so far,
# all starting at zero
p, q, r, s, t = 0, 0, 0, 0, 0
maxScore = 0

#Grid search for weights
Exemplo n.º 53
0
    presort=False,
    random_state=None,
    splitter='best'),
                  bootstrap=True,
                  bootstrap_features=False,
                  max_features=1.0,
                  max_samples=0.5,
                  n_estimators=20,
                  n_jobs=1,
                  oob_score=False,
                  random_state=None,
                  verbose=0,
                  warm_start=False)
# Score of the bagging model on the test data, then on the training data
print(bg.score(data_pd1, Y_test))

print(bg.score(data_pd, Y_train))

# Base models for the voting ensemble
lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf1 = RandomForestClassifier()
svm = SVC(kernel='poly', degree=2)

# Hard-voting ensemble over the four base models
evc = VotingClassifier(estimators=[('lr', lr), ('dt', dt), ('rf1', rf1),
                                   ('svm', svm)],
                       voting='hard')

evc.fit(data_pd, Y_train)
# Score of the ensemble on the test data, then on the training data
print(evc.score(data_pd1, Y_test))

print(evc.score(data_pd, Y_train))
Exemplo n.º 54
0
# In[110]:
"""
train an ensemble model using the previous models
"""
from sklearn.ensemble import VotingClassifier
# list of (name, estimator) pairs for the models built earlier
estimators = [('blob', blob_classifier), ('rf', text_classifier), ('net', net),
              ('features', feature_classifier), ('net2', net2)]
# create our hard-voting classifier from those models
ensemble = VotingClassifier(estimators, voting='hard')

# In[111]:

# fit model to training data
ensemble.fit(processed_features, labels)
ensemble_pred = ensemble.predict(processed_features)
# accuracy on the same data the ensemble was fitted on (training accuracy,
# not a held-out test score)
print(accuracy_score(labels, ensemble_pred))

# In[112]:

test_predictions = ensemble.predict(X_test)

# In[113]:

# .copy() gives an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
final_predictions = df_test[['review_id']].copy()

# In[114]:

final_predictions['is_good_rating'] = test_predictions
# create the individual classifiers
knc = KNeighborsClassifier(n_neighbors=19)
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(random_state=226)
bc = BaggingClassifier(DecisionTreeClassifier(), n_estimators=9)
adc = AdaBoostClassifier(DecisionTreeClassifier(),
                         n_estimators=5,
                         learning_rate=1)
lr = LogisticRegression()

# hard-voting ensemble over all six classifiers
vc = VotingClassifier(estimators=[('lr', lr), ('dtc', dtc), ('knc', knc),
                                  ('rfc', rfc), ('bc', bc), ('adc', adc)],
                      voting='hard')

vc.fit(X_train, y_train.values.ravel())  # fit the model on the training set

y_pred = vc.predict(X_test)  # predict the test-set labels
cm = confusion_matrix(y_test, y_pred)  # get the confusion matrix
# Accuracy = correctly classified samples (the diagonal) / all samples.
# trace() and sum() work for any number of classes; the former loops were
# hard-coded to a 3x3 matrix.
diag_sum = cm.trace()
total = cm.sum()
accuracy = diag_sum / total  # accuracy derived from the confusion matrix
# print accuracy and confusion matrix
print(accuracy)
print(cm)
Exemplo n.º 56
0
    # pcafeatures = pca.transform(features)

    features = sc.fit_transform(features)

    train, test, train_labels, test_labels = train_test_split(features,
                                                              labels,
                                                              test_size=0.2)

    # Train our classifier
    model2 = clf1.fit(train, train_labels)
    model3 = clf2.fit(train, train_labels)
    model4 = lda.fit(train, train_labels)
    model5 = kneigh.fit(train, train_labels)
    model7 = lr.fit(train, train_labels)

    modelVoting = voting.fit(train, train_labels)

    modelEnsemble1 = ensemble1.fit(train, train_labels)
    modelEnsemble2 = ensemble2.fit(train, train_labels)
    modelEnsemble3 = ensemble3.fit(train, train_labels)

    # Make predictions
    preds2 = clf1.predict(test)
    preds3 = clf2.predict(test)
    preds4 = lda.predict(test)
    preds5 = kneigh.predict(test)
    preds7 = lr.predict(test)

    predVoting = voting.predict(test)

    predEnsemble1 = ensemble1.predict(test)
Exemplo n.º 57
0
# Stack the three base models' test predictions side by side as the feature
# columns for the second-level model
df_test = pd.concat([test_pred1, test_pred2,test_pred3], axis=1)

ytest=pd.DataFrame(y_test)
# Second-level random forest fitted on df_train and scored on df_test
model = RandomForestClassifier(random_state=1,criterion="entropy")
model.fit(df_train,y_train)
model.score(df_test,y_test)



# Bagging: 20 copies of clf3, each fitted on half of the samples and all features
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
bg = BaggingClassifier(clf3, max_samples= 0.5, max_features = 1.0, n_estimators = 20)
bg.fit(x_train,y_train)
bg.score(x_test,y_test)

# Boosting - AdaBoost over logistic regression
adb = AdaBoostClassifier(LogisticRegression(),n_estimators = 5, learning_rate = 1)
adb.fit(x_train,y_train)
adb.score(x_test,y_test)

# Hard-voting ensemble over five previously defined classifiers
evc = VotingClassifier( estimators= [('svm',clf),('dt',clf1),('nb',clf2),('lr',clf3),('knn',clf6)], voting = 'hard')
evc.fit(x_train,y_train)
evc.score(x_test, y_test)
# Bare expression: only displays the first training row in an interactive session
x_train[0]

# Persist the fitted voting ensemble
import joblib  # used to save the model to a file
joblib_file = "model.pkl"   
joblib.dump(evc, joblib_file)

Exemplo n.º 58
0
def select_model(df, features):
    """Tune three classifiers by grid search and combine them by soft voting.

    Each candidate (logistic regression, k-nearest neighbours, random forest)
    is tuned with a 10-fold ``GridSearchCV`` on ``df[features]`` against the
    ``"Survived"`` column.  The winning estimators are then wrapped in a
    soft-voting ``VotingClassifier`` whose mean 10-fold cross-validation
    score is reported alongside the individual results.

    Args:
        df: table holding the feature columns and a ``"Survived"`` column.
        features: column labels of ``df`` to use as predictors.

    Returns:
        A list of four dicts.  The first three describe the tuned candidates
        (``name``, ``estimator``, ``hyperparameters``, ``best_params``,
        ``best_score``, ``best_model``); the last describes the ensemble
        (``name``, ``best_model``, ``best_score``).
    """
    all_X = df[features]
    all_y = df["Survived"]

    # One spec per candidate: a display name, its estimator and the
    # hyperparameter grid to search.
    logistic_spec = {
        "name": "LogisticRegression",
        "estimator": LogisticRegression(),
        "hyperparameters": {
            "solver": ["newton-cg", "lbfgs", "liblinear"]
        }
    }
    neighbours_spec = {
        "name": "KNeighborsClassifier",
        "estimator": KNeighborsClassifier(),
        "hyperparameters": {
            "n_neighbors": range(1, 30, 2),
            "weights": ["distance", "uniform"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "p": [1, 2]
        }
    }
    forest_spec = {
        "name": "RandomForestClassifier",
        "estimator": RandomForestClassifier(random_state=1),
        "hyperparameters": {
            "n_estimators": [4, 6, 9, 15],
            "criterion": ["entropy", "gini"],
            "max_depth": [2, 5, 10],
            "max_features": ["log2", "sqrt"],
            "min_samples_leaf": [1, 5, 8],
            "min_samples_split": [2, 3, 5],
        }
    }
    models = [logistic_spec, neighbours_spec, forest_spec]

    for spec in models:
        heading = spec['name']
        print(heading)
        print('-' * len(heading))

        search = GridSearchCV(spec["estimator"],
                              param_grid=spec["hyperparameters"],
                              cv=10,
                              n_jobs=3)
        search.fit(all_X, all_y)

        # Record the outcome of the search on the spec itself.
        spec["best_params"] = search.best_params_
        spec["best_score"] = search.best_score_
        spec["best_model"] = search.best_estimator_

        print("Best Score: {}".format(spec["best_score"]))
        print("Best Parameters: {}\n".format(spec["best_params"]))

    # Ensemble of the tuned winners, averaging their predicted probabilities.
    tuned_estimators = [(spec["name"], spec["best_model"]) for spec in models]
    ensemble = VotingClassifier(estimators=tuned_estimators,
                                voting='soft',
                                n_jobs=4)
    ensemble.fit(all_X, all_y)
    fold_scores = cross_val_score(ensemble, all_X, all_y, cv=10)

    ensemble_entry = {
        "name": "VotingClassifier",
        "best_model": ensemble,
        "best_score": np.mean(fold_scores)
    }
    models.append(ensemble_entry)

    # Report the ensemble in the same layout as the individual candidates.
    print(ensemble_entry['name'])
    print('-' * len(ensemble_entry['name']))
    print("Best Score: {}".format(ensemble_entry["best_score"]))

    return models
Exemplo n.º 59
0
def _practice_set_voting_ensemble():
    """Build the majority-vote ensemble over five different classifier types."""
    return VotingClassifier(estimators=[('svc', SVC()),
                                        ('rfc', RFC(bootstrap=False)),
                                        ('etc', ETC()),
                                        ('knn', neighbors.KNeighborsClassifier()),
                                        ('qda', quadda())])


# Zero-argument factories, in the exact order myclassify_practice_set fits
# them.  Each entry is deliberately a lambda/def rather than the bare class so
# that a classifier is only looked up and constructed when it is requested.
_PRACTICE_SET_CLASSIFIERS = (
    _practice_set_voting_ensemble,
    lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
    lambda: ETC(),
    lambda: BaggingClassifier(ETC()),
    lambda: SVC(),
    lambda: quadda(),   # quadratic decision boundary
    lambda: DTC(),
    lambda: neighbors.KNeighborsClassifier(),
    lambda: linda(),    # linear decision boundary
    lambda: RFC(),
    lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
    lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
    lambda: RFC(bootstrap=False),
    lambda: GBC(),
    lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
    lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
    lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
    lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
    lambda: NearestCentroid(),
    lambda: ABC(),
)


def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    """Fit up to ``numfiers`` classifiers and post-process their predictions.

    The first ``numfiers`` entries of ``_PRACTICE_SET_CLASSIFIERS`` are each
    fitted on ``(xtrain, ytrain)`` and used to predict ``xtest``.  Every
    prediction column is then reduced with the window-mode helper and
    converted to a string.

    Args:
        numfiers: number of classifiers to run; also the column count of the
            intermediate prediction matrix.
        xtrain: training feature matrix.
        ytrain: training labels (flattened to 1-D before fitting).
        xtltrain: indices of the recordings behind ``xtrain``/``ytrain``.
            Accepted for interface compatibility; not used by this function.
        xtltest: indices of the recordings behind ``xtest``; handed to the
            window-mode helpers.
        xtest: test feature matrix.  NaN/Inf entries are removed by
            ``removeNanAndInf`` first.
        ytarget: optional true labels for ``xtest``.
        testing: when true, the ``temp*`` helper variants are used and
            ``grids`` is passed to them.
        grids: forwarded to the ``temp*`` helpers; only read when ``testing``
            is true.

    Returns:
        A tuple ``(predictionStringMat, targetStringMat, finalPredMat)``:
        one prediction string per classifier, one target string per
        classifier, and the flat list of all window-mode predictions as ints.
    """
    # Remove NaN, Inf and -Inf values from the test data.
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    # For an N x M xtest this holds one prediction column per classifier.
    predictionMat = np.empty((xtest.shape[0], numfiers))
    count = 0
    for make_classifier in _PRACTICE_SET_CLASSIFIERS[:numfiers]:
        classifier = make_classifier()
        classifier.fit(xtrain, ytrain)
        predictionMat[:, count] = classifier.predict(xtest)
        count += 1

    predictionStringMat = []
    targetStringMat = []
    finalPredMat = []

    # Only the columns that were actually filled are post-processed: when
    # numfiers exceeds the number of available classifiers, the remaining
    # np.empty columns contain uninitialised memory.
    for colCount in range(count):
        tempCol = predictionMat[:, colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)

        predictionStringMat.append(modeStr)
        finalPredMat.extend(int(value) for value in modeCol)
        targetStringMat.append(modeStrans)
        # A confusion matrix used to be computed here behind
        # `if ytarget != None:`.  Its result was never used, and on current
        # NumPy that elementwise comparison makes the `if` raise for
        # multi-element targets, so the dead computation was dropped.

    return predictionStringMat, targetStringMat, finalPredMat
Exemplo n.º 60
0
# Learning curves for four of the tuned estimators.  `g` is overwritten on
# every call, so only the last returned object stays bound.
g = plot_learning_curve(gsExtC.best_estimator_,"ExtC ExtraTrees learning curves",X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsGBC.best_estimator_,"GBC Gradient Boost learning curves",X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsrandom_forest.best_estimator_,"RandomForest learning curves",X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsSVMC.best_estimator_,"SVMC learning curves",X_train,Y_train,cv=kfold)

# Predictions of each tuned classifier on X_test, as named Series so they can
# be lined up column-wise below.
test_Survived_AdaDTC = pd.Series(adaDTC_best.predict(X_test), name="AdaDTC")
test_Survived_ExtC = pd.Series(ExtC_best.predict(X_test), name="ExtC")
test_Survived_GBC = pd.Series(GBC_best.predict(X_test), name="GBC")
test_Survived_SVMC = pd.Series(SVMC_best.predict(X_test), name="SVMC")
test_Survived_random_forest = pd.Series(random_forest_best.predict(X_test), name="random_forest")

# Concatenate all classifier results and plot how strongly their predictions
# correlate with one another.
ensemble_results = pd.concat([test_Survived_AdaDTC, test_Survived_ExtC, test_Survived_GBC,test_Survived_SVMC,test_Survived_random_forest],axis=1)
g= sns.heatmap(ensemble_results.corr(),annot=True)

# Soft-voting ensemble over four of the tuned models (adaDTC_best is not
# included).
VotingPredictor = VotingClassifier(estimators=[('ExtC', ExtC_best), ('GBC',GBC_best),
('SVMC', SVMC_best), ('random_forest', random_forest_best)], voting='soft', n_jobs=4)
VotingPredictor = VotingPredictor.fit(X_train, Y_train)
# NOTE(review): this predicts on `test`, while the individual models above
# predict on `X_test` - confirm that both refer to the intended data.
VotingPredictor_predictions = VotingPredictor.predict(test)
test_Survived = pd.Series(VotingPredictor_predictions, name="Survived")

# Preparing data for Submission 3
# (test_Survived is rebuilt here from the same predictions as just above.)
test_Survived = pd.Series(VotingPredictor_predictions, name="Survived")
Submission3 = pd.concat([PassengerId,test_Survived],axis=1)
# Bare expression: has no visible effect when run as a script.
Submission3.head(15)

# 2x2 grid of subplots sharing the x axis, one panel per (title, estimator)
# pair listed in names_classifiers.
nrows = ncols = 2
fig, axes = plt.subplots(nrows = nrows, ncols = ncols, sharex="all", figsize=(15,7))
names_classifiers = [("AdaBoosting", adaDTC_best),("ExtraTrees",ExtC_best),
("GradientBoosting",GBC_best), ("RandomForest",random_forest_best)]

# Counter initialised before the plotting loop that follows; presumably the
# index into names_classifiers - confirm against the loop body.
nclassifier = 0
for row in range(nrows):
    for col in range(ncols):