def classify():
    linearSVM = LinearSVC(random_state=666, class_weight="balanced",
                          max_iter=5000, C=2.0, tol=0.001, dual=True)
    linearSVM_SVC = SVC(C=1, kernel="rbf", tol=1, random_state=0, gamma=1)
    logistic = LogisticRegression(fit_intercept=True, class_weight="balanced",
                                  n_jobs=-1, C=1.0, max_iter=200)
    rand_forest = RandomForestClassifier(n_estimators=403, random_state=666,
                                         max_depth=73, n_jobs=-1)
    bc = BaggingClassifier(base_estimator=logistic, n_estimators=403, n_jobs=-1,
                           random_state=666, max_features=410)
    ensemble_voting = VotingClassifier([("logistic", logistic),
                                        ("rand_forest", rand_forest),
                                        ("sgdc", SGDClassifier())],
                                       weights=[1, 1, 2])
    boost = AdaBoostClassifier(base_estimator=logistic)
    xgboost = XGBoostClassifier(n_estimators=103, seed=666, max_depth=4,
                                objective="multi:softmax")
    return ensemble_voting
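# Usage sketch (assumption, not part of the original code): classify() only builds the
# estimators and returns the voting ensemble, so a caller still has to vectorize the raw
# text and fit the result, as the Pipeline snippet further below does. The names
# `train_texts` / `train_labels` are hypothetical placeholders.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

clf_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', classify())])
# clf_pipeline.fit(train_texts, train_labels)
# predicted = clf_pipeline.predict(test_texts)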
def fit(self, X, y, **fit_params):
    result = XGBClassifierImpl(
        self.max_depth, self.learning_rate, self.n_estimators, self.verbosity,
        self.objective, self.booster, self.n_jobs, self.nthread, self.gamma,
        self.min_child_weight, self.max_delta_step, self.subsample,
        self.colsample_bytree, self.colsample_bylevel, self.colsample_bynode,
        self.reg_alpha, self.reg_lambda, self.scale_pos_weight, self.base_score,
        self.random_state, self.seed, self.missing)
    result._xgboost_model = XGBoostClassifier(**self.get_params())
    # **fit_params is always a dict (possibly empty), never None, so test for emptiness.
    if not fit_params:
        result._xgboost_model.fit(X, y)
    else:
        result._xgboost_model.fit(X, y, **fit_params)
    return result
rf_acc = cv(RandomForestClassifier(n_estimators=403, n_jobs=-1, random_state=seed),
            data_model.drop(columns='label'), data_model['label'])


# ## Experiment 3: Added Feature + XGBoost

# In[55]:

from xgboost import XGBClassifier as XGBoostClassifier


# In[56]:

X_train, X_test, y_train, y_test = train_test_split(
    data_model.drop(columns='label'), data_model['label'], test_size=0.3)
precision, recall, accuracy, f1 = test_classifier(
    X_train, y_train, X_test, y_test, XGBoostClassifier(seed=seed))


# ## Experiment 4: Added Feature + Naive Bayes

# In[57]:

X_train, X_test, y_train, y_test = train_test_split(
    data_model.drop(columns='label'), data_model['label'], test_size=0.3)
precision, recall, accuracy, f1 = test_classifier(
    X_train, y_train, X_test, y_test, BernoulliNB())


# ## Test Data

# In[58]:
def best_fit(X_train, y_train):
    log("")
    seed = 666
    import time as ttt
    attributes = len(X_train.columns)
    examples = len(X_train)
    now = time()
    log(ttt.ctime())

    # Parameters for SVM
    # parameters = {
    #     "dual": [True, False],
    #     "tol": [1e-3, 1e-4, 1e-5],
    #     "C": [1.0, 1.5, 2.0, 5.0, 10, 100, 1000]
    # }
    # rand_search = RandomizedSearchCV(LinearSVC(max_iter=5000), param_distributions=parameters,
    #                                  cv=8, n_jobs=-1, n_iter=20)
    #
    # rand_search.fit(X_train, y_train)
    # report(rand_search.cv_results_, 10)
    # log(ttt.ctime())
    # log(time() - now)
    # return

    # Parameters for Bagging
    # parameters = {
    #     "n_estimators": [2, 3, 5, 13, 51, 201, 303, 403, 505],
    #     "max_features": list(map(lambda x: int(x),
    #                              [sqrt(attributes), 2 * sqrt(attributes), 3 * sqrt(attributes),
    #                               attributes / 2, attributes / 3, attributes / 4]))
    # }
    #
    # rand_search = RandomizedSearchCV(BaggingClassifier(
    #     base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000,
    #                              C=1.0, tol=0.0001, dual=True),
    #     random_state=seed, n_jobs=1), param_distributions=parameters, n_jobs=-1, n_iter=3, cv=8,
    #     scoring=make_scorer(f1_score, average="micro", labels=["positive", "negative", "neutral"]))
    #
    # now = time()
    # log(ttt.ctime())
    # rand_search.fit(X_train, y_train)
    #
    # report(rand_search.cv_results_, 10)

    log(ttt.ctime())
    log(time() - now)

    # Parameters for RF
    # log("RF:")
    # parameters = {
    #     "n_estimators": [103, 201, 305, 403, 666, 1001, 5007, 10001],
    #     "max_depth": [None, 5, 20, 40, 73, 100, 1000, 2000],
    #     "criterion": ["gini", "entropy"]
    # }
    #
    # rand_search = RandomizedSearchCV(RandomForestClassifier(random_state=seed, n_jobs=-1),
    #                                  param_distributions=parameters, n_iter=15,
    #                                  scoring="accuracy", n_jobs=1, cv=10)
    # now = time()
    # log(ttt.ctime())
    # rand_search.fit(X_train, y_train)
    #
    # report(rand_search.cv_results_, 10)
    # log(ttt.ctime())
    # log(time() - now)

    # Parameters for XGBoost
    log("XGB:")
    parameters = {
        "n_estimators": [103, 201, 403],
        "max_depth": [3, 10, 15],
        "objective": ["multi:softmax", "binary:logistic"],
        "learning_rate": [0.05, 0.1, 0.15, 0.3]
    }
    rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed), param_distributions=parameters,
                                     n_iter=5, scoring="accuracy", n_jobs=-1, cv=8)
    now = time()
    log(ttt.ctime())
    rand_search.fit(X_train, y_train)

    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)

    parameters = {
        "n_estimators": [403, 666, 1000],
        "max_depth": [40, 50, 90, 100, 200],
        "subsample": [1.0, 0.6, 0.9],
        "objective": ["multi:softmax", "binary:logistic"],
        "learning_rate": [0.1, 0.15, 0.5]
    }
    rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed), param_distributions=parameters,
                                     n_iter=5, scoring="accuracy", n_jobs=-1, cv=8)
    now = time()
    log(ttt.ctime())
    rand_search.fit(X_train, y_train)

    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)
    return

    # Parameters for VotingClassifier
    # parameters = {
    #     "weights": [
    #         [1, 1, 1], [2, 1, 1], [2, 2, 1], [4, 1, 5], [1, 1, 2], [5, 1, 2], [5, 2, 1],
    #         [5, 3, 2], [6, 2, 1], [6, 1, 5], [6, 1, 2], [7, 1, 6], [7, 2, 3],
    #     ]
    # }
    log("Voting RF XGB NB:")
    parameters = {
        "weights": [[1, 1, 1], [2, 1, 1], [1, 1, 2], [4, 1, 5], [3, 1, 3], [3, 1, 4]]
    }
    rand_search = GridSearchCV(VotingClassifier(
        [("randomforest", RandomForestClassifier(
            n_estimators=403, random_state=seed, max_depth=73, n_jobs=-1)),
         ("naivebayes", BernoulliNB()),
         ("xgboost", XGBoostClassifier(n_estimators=103, seed=seed, max_depth=3,
                                       objective="multi:softmax"))],
        voting="soft", n_jobs=1), scoring="accuracy", n_jobs=-1, cv=8, param_grid=parameters)
    rand_search.fit(X_train, y_train)
    # report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)
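# The report() helper called above is not defined in this snippet. A minimal sketch of what
# it is assumed to do (print the top-n candidates from cv_results_, following the common
# scikit-learn RandomizedSearchCV reporting recipe); the real implementation may differ.
# It reuses the project's log() function for output.
import numpy as np

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            log("Model with rank: {0}".format(i))
            log("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            log("Parameters: {0}".format(results['params'][candidate]))
            log("")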
from xgboost import XGBClassifier as XGBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(
    data_model.drop(columns='label'), data_model['label'], test_size=0.3)
precision, recall, accuracy, f1 = test_classifier(
    X_train, y_train, X_test, y_test, XGBoostClassifier(seed=seed))
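# test_classifier() is referenced throughout these experiments but not defined in this
# snippet. A minimal sketch of the assumed behaviour (fit the given classifier on the
# training split, then return precision, recall, accuracy, and F1 on the test split);
# the real helper may log results or use a different averaging strategy.
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

def test_classifier(X_train, y_train, X_test, y_test, classifier):
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    precision = precision_score(y_test, predictions, average="weighted")
    recall = recall_score(y_test, predictions, average="weighted")
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average="weighted")
    return precision, recall, accuracy, f1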
logistic = LogisticRegression(fit_intercept=True, class_weight="balanced", n_jobs=-1,
                              C=1.0, max_iter=200)
rand_forest = RandomForestClassifier(n_estimators=403, random_state=666, max_depth=73, n_jobs=-1)
bc = BaggingClassifier(base_estimator=logistic, n_estimators=403, n_jobs=-1,
                       random_state=666, max_features=410)
ensemble_voting = VotingClassifier([("svm", linearSVM_SVC), ("logistic", logistic),
                                    ("rand_forest", rand_forest), ("sgdc", SGDClassifier())],
                                   weights=[1, 2, 1, 1])
boost = AdaBoostClassifier(base_estimator=logistic)
xgboost = XGBoostClassifier(n_estimators=103, seed=666, max_depth=4, objective="multi:softmax")

X_train, X_test, Y_train, Y_test = train_test_split(left, right, test_size=0.1)
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf-svm', ensemble_voting)])
text_clf = text_clf.fit(X_train, Y_train)
predicted = text_clf.predict(X_test)
print(classification_report(Y_test, predicted))
cdTest = dataCleanup()
cdTest.init(data_dir + 'tweets.csv', isTestingSet=True)
cdTest.buildFeatures()
cdTest.cleanData()
tTest = cdTest.processedData

stTest = stemAndTokenizeData()
tTest = stTest.tokenize(tTest)
tTest = stTest.stem(tTest)

bwTest = buildWordList()
bwTest.buildWordListFunction(tTest)

bowTest = bagOfWords()
bowTest.buildDataModel(tTest, bwTest.wordList, uW2V, isTestingSet=True)
dataModelTest = bowTest.dataModel
print("Testing Model built!")

xgboost = XGBoostClassifier(seed=seed, n_estimators=403, max_depth=10,
                            objective="binary:logistic", learning_rate=0.15)
xgboost.fit(dataModel.iloc[:, 1:], dataModel.iloc[:, 0])
print("Training Finished!")

predictions = xgboost.predict(dataModelTest.iloc[:, 1:])
results = pd.DataFrame([], columns=["Id", "Category"])
results["Id"] = dataModelTest["original_id"].astype("int64")
results["Category"] = predictions
results.to_csv("results.csv", index=False)
print("Results have been saved to file!!")
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier as XGBoostClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv('Loan payments data.csv')
print(data.head())

# Select the feature columns and the target.
X = data[['Principal', 'terms', 'age', 'education', 'Gender']]
Y = data['loan_status']

# Encode the categorical columns.
gender_label = LabelEncoder()
gender_label.fit(X.Gender)
X['Gender_Labelled'] = gender_label.transform(X.Gender)

Ed_Label = LabelEncoder()
Ed_Label.fit(X.education)
X['Ed_Labelled'] = Ed_Label.transform(X.education)

Y_label = LabelEncoder()
Y_label.fit(Y)
Y_labelled = Y_label.transform(Y)

X_Labelled = X[['Principal', 'terms', 'age', 'Ed_Labelled', 'Gender_Labelled']]

x_train, x_test, y_train, y_test = train_test_split(X_Labelled, Y_labelled)

model = XGBoostClassifier()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
print(accuracy_score(y_test, y_predict))
def best_fit(X_train, y_train):
    log("")
    seed = 666
    import time as ttt
    attributes = len(X_train.columns)
    examples = len(X_train)
    now = time()
    log(ttt.ctime())
    log(ttt.ctime())
    log(time() - now)

    log("XGB:")
    parameters = {
        "n_estimators": [103, 201, 403],
        "max_depth": [3, 10, 15],
        "objective": ["multi:softmax", "binary:logistic"],
        "learning_rate": [0.05, 0.1, 0.15, 0.3]
    }
    rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed), param_distributions=parameters,
                                     n_iter=5, scoring="accuracy", n_jobs=-1, cv=8)
    now = time()
    log(ttt.ctime())
    rand_search.fit(X_train, y_train)

    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)

    parameters = {
        "n_estimators": [403, 666, 1000],
        "max_depth": [40, 50, 90, 100, 200],
        "subsample": [1.0, 0.6, 0.9],
        "objective": ["multi:softmax", "binary:logistic"],
        "learning_rate": [0.1, 0.15, 0.5]
    }
    rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed), param_distributions=parameters,
                                     n_iter=5, scoring="accuracy", n_jobs=-1, cv=8)
    now = time()
    log(ttt.ctime())
    rand_search.fit(X_train, y_train)

    report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)
    return

    log("Voting RF XGB NB:")
    parameters = {
        "weights": [[1, 1, 1], [2, 1, 1], [1, 1, 2], [4, 1, 5], [3, 1, 3], [3, 1, 4]]
    }
    rand_search = GridSearchCV(VotingClassifier(
        [("randomforest", RandomForestClassifier(
            n_estimators=403, random_state=seed, max_depth=73, n_jobs=-1)),
         ("naivebayes", BernoulliNB()),
         ("xgboost", XGBoostClassifier(n_estimators=103, seed=seed, max_depth=3,
                                       objective="multi:softmax"))],
        voting="soft", n_jobs=1), scoring="accuracy", n_jobs=-1, cv=8, param_grid=parameters)
    rand_search.fit(X_train, y_train)
    # report(rand_search.cv_results_, 10)
    log(ttt.ctime())
    log(time() - now)