class AdaBoostClassifierImpl():
    """Thin wrapper delegating to scikit-learn's AdaBoost classifier (`SKLModel`).

    Stores the constructor hyperparameters in ``self._hyperparams`` and
    forwards fit/predict/predict_proba/decision_function to the wrapped model.
    """

    def __init__(self, base_estimator=None, n_estimators=50,
                 learning_rate=1.0, algorithm='SAMME.R', random_state=None):
        self._hyperparams = {
            'base_estimator': base_estimator,
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'algorithm': algorithm,
            'random_state': random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; returns self so calls can be chained."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Predict class labels for X via the wrapped model."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities for X via the wrapped model."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped model's decision-function scores for X."""
        return self._wrapped_model.decision_function(X)
class AdaBoostClassifierImpl():
    """AdaBoost wrapper that also accepts a Lale individual operator as
    base_estimator, unwrapping it to its underlying sklearn model before
    constructing the wrapped ``SKLModel``.
    """

    def __init__(self, base_estimator=None, n_estimators=50,
                 learning_rate=1.0, algorithm='SAMME.R', random_state=None):
        if isinstance(base_estimator, lale.operators.Operator):
            # Only an individual operator exposes a single sklearn model to unwrap.
            if not isinstance(base_estimator, lale.operators.IndividualOp):
                raise ValueError("If base_estimator is a Lale operator, it needs to be an individual operator. ")
            base_estimator = base_estimator._impl_instance()._wrapped_model
        self._hyperparams = {
            'base_estimator': base_estimator,
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'algorithm': algorithm,
            'random_state': random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; returns self so calls can be chained."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Predict class labels for X via the wrapped model."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities for X via the wrapped model."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped model's decision-function scores for X."""
        return self._wrapped_model.decision_function(X)
def determined_train_and_predict(train_datas, train_lables, test_ids, test_datas):
    """Train an AdaBoost-over-random-forest classifier and write test-set
    predictions to 'sample3.csv' with columns Id and Response.
    """
    booster = AdaBoostClassifier(RandomForestClassifier(n_estimators=300),
                                 algorithm="SAMME", n_estimators=400)
    booster.fit(train_datas, train_lables)
    predictions = booster.predict(test_datas)
    submission = pd.DataFrame({'Id': test_ids, 'Response': predictions})
    submission.to_csv('sample3.csv', index=False)
def AB(pth):
    """Train an AdaBoost bag-of-features classifier on precomputed training
    descriptors stored under *pth*, persist (model, classes, scaler), then
    evaluate via test().

    NOTE(review): relies on module-level globals `image_paths`, `train_labels`
    and `img_classes` that are not visible in this view — confirm they exist
    in the enclosing module before calling.
    """
    train_desc=np.load(pth+'/training_features.npy')
    # Per-feature document frequency: number of samples where the feature > 0.
    nbr_occurences = np.sum( (train_desc > 0) * 1, axis = 0)
    # IDF weights. NOTE(review): `idf` is computed but never used below —
    # looks like dead code or a missing weighting step; verify intent.
    idf = np.array(np.log((1.0*len(image_paths)+1) / (1.0*nbr_occurences + 1)), 'float32')
    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelAB=AdaBoostClassifier(n_estimators=100)
    modelAB.fit(train_desc,np.array(train_labels))
    # Persist model, class labels and scaler together for later prediction.
    joblib.dump((modelAB, img_classes, stdSlr), pth+"/ab-bof.pkl", compress=3)
    test(pth, "ab-")
def classify(X, y, cv):
    """Cross-validate and fit an AdaBoostClassifier, then report confusion
    statistics on the training set.

    Parameters:
        X: feature matrix.
        y: binary labels (0/1) — assumed by the tp/tn/fp/fn counting below.
        cv: number of cross-validation folds.

    Returns:
        The fitted classifier.

    Fix: the original body used Python 2 `print` statements, which are a
    SyntaxError under Python 3; converted to print() calls with the format
    strings preserved. Also reuses the already-computed `preds` instead of
    calling clf.predict(X) a second time for the counter.
    """
    clf = AdaBoostClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X, y)
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(preds))
    fp = 0
    tp = 0
    fn = 0
    tn = 0
    # Tally the confusion cells; assumes labels are exactly 0 or 1.
    for a in range(len(y)):
        if y[a] == preds[a]:
            if preds[a] == 0:
                tn += 1
            elif preds[a] == 1:
                tp += 1
        elif preds[a] == 1:
            fp += 1
        elif preds[a] == 0:
            fn += 1
    print('correct positives:', tp)
    print('correct negatives:', tn)
    print('false positives:', fp)
    print('false negatives:', fn)
    # NOTE: these ratios raise ZeroDivisionError when a denominator is 0,
    # matching the original behavior.
    print('precision:', float(tp) / (tp + fp))
    print('recall (tp)/(tp+fn):', float(tp) / (tp + fn))
    print('false positive rate (fp)/(fp+tn):', float(fp) / (fp + tn))
    print('false positive rate2 (fp)/(fp+tp):', float(fp) / (fp + tp))
    print('prediction accuracy: %s%s\n' % (100 * float(tp + tn) / (tp + tn + fp + fn), '%'))
    return clf
# NOTE(review): this chunk starts mid-statement — the line below closes a
# train/test-split call whose opening parenthesis is outside this view.
test_size=0.2, random_state=42)

#:# preprocessing
# Scale features; keep the original column names on the transformed frame.
transform_pipeline = Pipeline([('scaler', StandardScaler())])
X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model
params = {'learning_rate': 0.5, 'n_estimators': 300}
classifier = AdaBoostClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# e595f5d5683f3e3692608020cd5bde18
# Fingerprint the model configuration from its repr for reproducibility checks.
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
# Apply the SAME fitted pipeline to the test set before predicting.
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
def result():
    """Flask view (POST): read an uploaded CSV, clean the chosen text column,
    train 12 candidate classifiers on TF-IDF features, pickle the most
    accurate model plus the fitted vectorizer under the user-supplied
    filename, and render all accuracies.

    NOTE(review): pickle files are opened without being closed, and the
    'XGBC' entry (clf12) is actually a second SGDClassifier — presumably a
    leftover from an XGBoost experiment; verify intent.
    """
    if request.method == 'POST':
        path = request.files.get('myFile')
        df = pd.read_csv(path, encoding="ISO-8859-1")
        filename = request.form['filename']
        str1 = request.form['feature']
        str2 = request.form['label']
        # Validate that the requested feature/label columns exist in the CSV.
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')
        # Strip URLs, then any non-alphanumeric characters, from each document.
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        X = X.str.lower()
        # Disabled spaCy lemmatization/stopword step, kept as-is.
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        X = pd.Series(texts)
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        # Vectorizer is fit on the train split only; test docs use transform().
        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)
        # --- Candidate 1: linear SVM ---
        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))
        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))
        # --- Candidate 2: multinomial logistic regression ---
        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))
        # --- Candidate 3: random forest ---
        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)
        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))
        # --- Candidate 4: multinomial naive Bayes ---
        start = time()
        clf4 = MultinomialNB()
        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))
        # --- Candidate 5: Gaussian NB (needs dense arrays) ---
        start = time()
        clf5 = GaussianNB()
        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))
        # --- Candidate 6: cross-validated logistic regression ---
        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))
        # --- Candidate 7: AdaBoost ---
        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))
        # --- Candidate 8: Bernoulli NB (dense) ---
        start = time()
        clf8 = BernoulliNB()
        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))
        # --- Candidate 9: perceptron (dense) ---
        start = time()
        clf9 = Perceptron(n_jobs=-1)
        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))
        # --- Candidate 10: cross-validated ridge classifier (dense) ---
        start = time()
        clf10 = RidgeClassifierCV()
        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))
        # --- Candidate 11: SGD classifier (dense) ---
        start = time()
        clf11 = SGDClassifier(n_jobs=-1)
        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        # --- Candidate 12: labelled XGBC but is another SGDClassifier ---
        # NOTE(review): duplicate of candidate 11; confirm whether an
        # XGBoost model was intended here.
        start = time()
        clf12 = SGDClassifier(n_jobs=-1)
        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))
        # Pick the classifier with the highest accuracy and pickle it.
        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        max_list = max(acu_list)
        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a5:
            pickle.dump(clf5, open(filename + '_model', 'wb'))
        elif max_list == a6:
            pickle.dump(clf6, open(filename + '_model', 'wb'))
        elif max_list == a7:
            pickle.dump(clf7, open(filename + '_model', 'wb'))
        elif max_list == a8:
            pickle.dump(clf8, open(filename + '_model', 'wb'))
        elif max_list == a9:
            pickle.dump(clf9, open(filename + '_model', 'wb'))
        elif max_list == a10:
            pickle.dump(clf10, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))
        # Persist the fitted vectorizer alongside the chosen model.
        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))
        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4,
                               ac5=a5, ac6=a6, ac7=a7, ac8=a8, ac9=a9,
                               ac10=a10, ac11=a11, ac12=a12)
# Train and test random forests. # load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin" load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.bin" homesite = Data() homesite.load_sliptted_data(load_path) del homesite.test_x # Deleted to save memory. clf_ann = NeuralNetwork(path = "../homesite_data/ann_weights.bin", lr = 0.00005, \ lamb = 0) train_output_ann = clf_ann.get_hidden_output(homesite.train_x) validation_output_ann = clf_ann.get_hidden_output(homesite.validation_x) # train_output_ann = np.hstack((train_output_ann, homesite.train_x)) # validation_output_ann = np.hstack((validation_output_ann, homesite.validation_x)) for c in range(1, 10): # Train classifier. print "Training classifier." clf = AdaBoostClassifier(n_estimators=1 + 100 * c) clf.fit(train_output_ann, homesite.train_y) # Test classifier. print 'Testing classifier.' predicted_labels = clf.predict_proba(validation_output_ann)[:, 1] # Show final results. results = confusion_matrix(homesite.validation_y, np.round(predicted_labels)) accuracy, precision, recall = compute_performance_metrics(results) auc = compute_auc(homesite.validation_y, predicted_labels)
# NOTE(review): this chunk depends on names defined outside this view
# (`homesite`, `c`, `mean_tpr`, `mean_fpr`, `confusion_matrix_history`);
# `mean_tpr` in particular must already exist for the `+=` below.
all_tpr = []
# Stratified 5-fold CV over the training labels (legacy n_folds API).
cvs = StratifiedKFold(homesite.train_y, n_folds=5)
clf = AdaBoostClassifier(n_estimators=c, random_state=0)
# Train classifier.
print "\nTraining classifier param %d" % c
for i, (train, test) in enumerate(cvs):
    # Oversample only the training fold to avoid leaking into the test fold.
    sm = OverSampler(verbose=False, ratio=2.5)
    train_oversampled_x, train_oversampled_train_y = sm.fit_transform(
        homesite.train_x[train], homesite.train_y[train])
    probas_ = clf.fit(train_oversampled_x, train_oversampled_train_y).predict_proba(
        homesite.train_x[test])
    # Accumulate the interpolated ROC curve across folds.
    fpr, tpr, thresholds = roc_curve(homesite.train_y[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = compute_auc(homesite.train_y[test], probas_[:, 1])
    # Confusion matrix from 0.5-thresholded positive-class probabilities.
    fold_cm = confusion_matrix(homesite.train_y[test], np.round(probas_)[:, 1])
    confusion_matrix_history = np.dstack(
        (confusion_matrix_history, fold_cm))
    accuracy, precision, recall = compute_performance_metrics(fold_cm)
# NOTE(review): Python 2 script using the deprecated sklearn
# `cross_validation` module; depends on `l_train` loaded outside this view.
# One-hot encode the categorical columns, then drop the originals.
l_train = l_train.join(pd.get_dummies(l_train['Transmission']))
l_train = l_train.join(pd.get_dummies(l_train['WheelType']))
l_train = l_train.join(pd.get_dummies(l_train['Size']))
l_train = l_train.drop(['Auction','Transmission','WheelType','Size'],axis=1)
l_train = l_train.dropna()
data = l_train.drop('IsBadBuy',axis=1)
target = l_train['IsBadBuy']
x_train, x_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=.3)
# AdaBoost Runs the best
model = AdaBoostClassifier()
clf = model.fit(x_train, y_train)
scores = clf.score(x_train,y_train)
print metrics.classification_report(y_train, clf.predict(x_train))
print metrics.classification_report(y_test, clf.predict(x_test))
y_pred = clf.predict(x_test)
# NOTE(review): roc_auc_score on hard 0/1 predictions, not probabilities,
# and the return values are discarded — verify intent.
metrics.roc_auc_score(y_train,clf.predict(x_train))
metrics.roc_auc_score(y_test,clf.predict(x_test))
# Create a submission
#submission = pd.DataFrame({ 'RefId' : l_test.RefId, 'prediction' : y_pred })
#submission.to_csv('/users/alexandersedgwick/desktop/submission.csv')
# Print Confusion Matrix metrics.confusion_matrix(etclf.predict(x_test), y_test) from sklearn.ensemble.forest import RandomForestClassifier rdclf = RandomForestClassifier(n_estimators=20, max_depth=10) rdclf.fit(x_train, y_train) metrics.confusion_matrix(rdclf.predict(x_test), y_test) from sklearn.ensemble.weight_boosting import AdaBoostClassifier adaclf = AdaBoostClassifier(n_estimators=20) adaclf.fit(x_train, y_train) metrics.confusion_matrix(adaclf.predict(x_test), y_test) metrics.confusion_matrix(etclf.predict(x_test), y_test) metrics.confusion_matrix(rdclf.predict(x_test), y_test) metrics.confusion_matrix(adaclf.predict(x_test), y_test) #The base random forest model seems to do best here. import time
def init_model(input_data, target_data, n_estimators=285, learning_rate=0.19,
               algorithm='SAMME.R'):
    """Fit and return an AdaBoostClassifier on the given data.

    Parameters:
        input_data: training feature matrix.
        target_data: training labels.
        n_estimators: number of boosting rounds (default matches the
            previously hard-coded 285).
        learning_rate: shrinkage applied to each classifier's contribution
            (default matches the previously hard-coded 0.19).
        algorithm: boosting variant, 'SAMME' or 'SAMME.R' (default unchanged).

    Returns:
        The fitted AdaBoostClassifier.

    Generalization: the hyperparameters were hard-coded; they are now
    keyword parameters with identical defaults, so existing callers see
    the same behavior while new callers can tune them.
    """
    model = AdaBoostClassifier(n_estimators=n_estimators,
                               learning_rate=learning_rate,
                               algorithm=algorithm)
    model.fit(input_data, target_data)
    return model