def determined_train_and_predict(train_datas, train_lables, test_ids, test_datas):
    class_fier = AdaBoostClassifier(RandomForestClassifier(n_estimators=300),
                                    algorithm="SAMME", n_estimators=400)
    # class_fier = RandomForestClassifier(n_estimators=300)
    class_fier.fit(train_datas, train_lables)
    predict_lables = class_fier.predict(test_datas)
    result_dic = {}
    result_dic['Id'] = test_ids
    result_dic['Response'] = predict_lables
    out_file_content = pd.DataFrame(result_dic)
    out_file_content.to_csv('sample3.csv', index=False)
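A minimal usage sketch for the function above; the toy arrays are invented for illustration and assume the pandas/scikit-learn imports used by the function are already in scope.

# Illustrative call with made-up data; writes sample3.csv as a side effect.
import numpy as np

X_train = np.random.rand(100, 5)
y_train = np.random.randint(0, 2, size=100)
X_test = np.random.rand(20, 5)
test_ids = np.arange(20)

determined_train_and_predict(X_train, y_train, test_ids, X_test)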
def AB(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')
    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelAB = AdaBoostClassifier(n_estimators=100)
    modelAB.fit(train_desc, np.array(train_labels))
    joblib.dump((modelAB, img_classes, stdSlr), pth + "/ab-bof.pkl", compress=3)
    test(pth, "ab-")
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if (y is not None):
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def ada():
    pipeline = Pipeline([
        ('count_vectorizer', CountVectorizer(binary=True, ngram_range=(1, 2),
                                             max_features=15000, stop_words=stopwords)),
        ('clf', AdaBoostClassifier())
    ])
    train_report(pipeline)
class AdaBoostClassifierImpl():
    def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0,
                 algorithm='SAMME.R', random_state=None):
        self._hyperparams = {
            'base_estimator': base_estimator,
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'algorithm': algorithm,
            'random_state': random_state}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
class AdaBoostClassifierImpl():
    def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0,
                 algorithm='SAMME.R', random_state=None):
        if isinstance(base_estimator, lale.operators.Operator):
            if isinstance(base_estimator, lale.operators.IndividualOp):
                base_estimator = base_estimator._impl_instance()._wrapped_model
            else:
                raise ValueError("If base_estimator is a Lale operator, it needs to be an individual operator.")
        self._hyperparams = {
            'base_estimator': base_estimator,
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'algorithm': algorithm,
            'random_state': random_state}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
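A short usage sketch of the wrapper class above, assuming SKLModel is bound to sklearn.ensemble.AdaBoostClassifier in this module (as the Lale-style wrappers do); the data is synthetic and only illustrative.

# Usage sketch: fit/predict through the wrapper with made-up binary data.
import numpy as np

X = np.random.rand(50, 4)
y = np.random.randint(0, 2, size=50)

impl = AdaBoostClassifierImpl(n_estimators=25, random_state=0)
impl.fit(X, y)
print(impl.predict(X[:5]))
print(impl.predict_proba(X[:5]).shape)  # (5, 2) for two classes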
def check_classifiers(n_samples=10000):
    """
    This function is not tested by default, it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_features = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = GaussianNB()

    uBoost_SAMME = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, algorithm="SAMME.R")
    uBoost_SAMME_R_threaded = uBoostClassifier(
        uniform_features=uniform_features, uniform_label=1, n_neighbors=50,
        efficiency_steps=5, n_estimators=50, n_threads=3, subsample=0.9,
        algorithm="SAMME.R")

    clf_dict = OrderedDict({
        "Ada": ada,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R,
        "uBOOST.R2": uBoost_SAMME_R_threaded
    })

    cvms = {}
    for clf_name, clf in clf_dict.items():
        clf.fit(trainX, trainY)
        p = clf.predict_proba(testX)
        metric = KnnBasedCvM(uniform_features=uniform_features)
        metric.fit(testX, testY)
        cvms[clf_name] = metric(testY, p, sample_weight=np.ones(len(testY)))

    assert cvms['uBOOST'] < cvms['Ada']
    print(cvms)
def classify(X, y, cv):
    # clf = DecisionTreeClassifier(criterion='entropy', min_samples_split=10, random_state=5)
    # clf = RandomForestClassifier(n_estimators=1000)
    clf = AdaBoostClassifier()
    # clf = ExtraTreesClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X, y)
    # print('Feature Importances')
    # print(clf.feature_importances_)
    # X = clf.transform(X, threshold=.3)
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(clf.predict(X)))
    fp = 0
    tp = 0
    fn = 0
    tn = 0
    for a in range(len(y)):
        if y[a] == preds[a]:
            if preds[a] == 0:
                tn += 1
            elif preds[a] == 1:
                tp += 1
        elif preds[a] == 1:
            fp += 1
        elif preds[a] == 0:
            fn += 1
    print('correct positives:', tp)
    print('correct negatives:', tn)
    print('false positives:', fp)
    print('false negatives:', fn)
    print('precision:', float(tp) / (tp + fp))
    print('recall (tp)/(tp+fn):', float(tp) / (tp + fn))
    print('false positive rate (fp)/(fp+tn):', float(fp) / (fp + tn))
    print('false positive rate2 (fp)/(fp+tp):', float(fp) / (fp + tp))
    print('prediction accuracy: %s%s\n' % (100 * float(tp + tn) / (tp + tn + fp + fn), '%'))
    return clf
def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0,
             algorithm='SAMME.R', random_state=None):
    if isinstance(base_estimator, lale.operators.Operator):
        if isinstance(base_estimator, lale.operators.IndividualOp):
            base_estimator = base_estimator._impl_instance()._wrapped_model
        else:
            raise ValueError("If base_estimator is a Lale operator, it needs to be an individual operator.")
    self._hyperparams = {
        'base_estimator': base_estimator,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'algorithm': algorithm,
        'random_state': random_state}
    self._wrapped_model = SKLModel(**self._hyperparams)
def __init__(self, base_estimator=None, n_estimators=50, learning_rate=1.0,
             algorithm='SAMME.R', random_state=None):
    self._hyperparams = {
        'base_estimator': base_estimator,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'algorithm': algorithm,
        'random_state': random_state}
    self._wrapped_model = SKLModel(**self._hyperparams)
def get_feature_ranking(X_train, y_train):
    print("feature ranking running....-> LogisticRegression")
    model1 = LogisticRegression(max_iter=500)
    rfe = RFECV(estimator=model1, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    logr_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        logr_ranking.append([d, x])
    logr_ranking = pd.DataFrame(logr_ranking, columns=['features1', 'logr'])
    logr_ranking.sort_values('features1', inplace=True)

    print("feature ranking running....-> GradientBoostingClassifier")
    model2 = GradientBoostingClassifier()
    rfe = RFECV(estimator=model2, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    gboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        gboost_ranking.append([d, x])
    gboost_ranking = pd.DataFrame(gboost_ranking, columns=['features2', 'gboost'])
    gboost_ranking.sort_values('features2', inplace=True)

    print("feature ranking running....-> AdaBoostClassifier")
    model3 = AdaBoostClassifier()
    rfe = RFECV(estimator=model3, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    adaboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        adaboost_ranking.append([d, x])
    adaboost_ranking = pd.DataFrame(adaboost_ranking, columns=['features3', 'adaboost'])
    adaboost_ranking.sort_values('features3', inplace=True)

    feature_sum = logr_ranking['logr'] + gboost_ranking['gboost'] + adaboost_ranking['adaboost']
    df_ranked = pd.concat([logr_ranking['features1'], feature_sum], axis=1)
    df_ranked.sort_values(0, inplace=True)
    return df_ranked
def check_classifiers(n_samples=10000, output_name_pattern=None):
    """
    This function is not tested by default, it should be called manually
    """
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)
    uniform_variables = ['column0']

    ada = AdaBoostClassifier(n_estimators=50)
    ideal_bayes = HidingClassifier(train_variables=trainX.columns[1:],
                                   base_estimator=GaussianNB())
    uBoost_SAMME = uBoostClassifier(uniform_variables=uniform_variables, n_neighbors=50,
                                    efficiency_steps=5, n_estimators=50, algorithm="SAMME")
    uBoost_SAMME_R = uBoostClassifier(uniform_variables=uniform_variables, n_neighbors=50,
                                      efficiency_steps=5, n_estimators=50, algorithm="SAMME.R")

    clf_dict = ClassifiersDict({
        "Ada": ada,
        "Ideal": ideal_bayes,
        "uBOOST": uBoost_SAMME,
        "uBOOST.R": uBoost_SAMME_R
    })
    clf_dict.fit(trainX, trainY)

    predictions = Predictions(clf_dict, testX, testY)
    # predictions.print_mse(uniform_variables, in_html=False)
    print(predictions.compute_metrics())

    predictions.sde_curves(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "mse_curves", bbox="tight")
    _ = pl.figure()
    predictions.learning_curves()
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "learning_curves", bbox="tight")
    predictions.efficiency(uniform_variables)
    if output_name_pattern is not None:
        pl.savefig(output_name_pattern % "efficiency_curves", bbox="tight")
def defaultModels(df_xmat, df_ymat_cat):
    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]
    cv = StratifiedKFold(n_splits=10)
    res = []
    for clf in classifiers:
        print('processing...' + str(clf)[:10])
        metrics_cv = []
        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):
            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]
            clf.fit(X_train, y_train)
            metrics_cv.append(clf.score(X_test, y_test))
        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])
    return res
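An illustrative call of defaultModels with made-up data (the DataFrame, labels, and column names below are invented; the sklearn classifiers used inside the function are assumed to be imported).

# Toy invocation: 100 random samples, 6 features, binary labels.
import numpy as np
import pandas as pd

df_xmat = pd.DataFrame(np.random.rand(100, 6), columns=['f%d' % i for i in range(6)])
df_ymat_cat = list(np.random.randint(0, 2, size=100))

for name, mean_acc, std_acc in defaultModels(df_xmat, df_ymat_cat):
    print(name, round(mean_acc, 3), '+/-', round(std_acc, 3))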
def __init__(self):
    self.random_rate = 33
    clf1 = SVC(C=1.0, random_state=33)
    clf2 = XGBClassifier(n_estimators=220, learning_rate=0.2, min_child_weight=2.3)
    clf3 = RandomForestClassifier(n_estimators=80, random_state=330, n_jobs=-1)
    clf4 = BaggingClassifier(n_estimators=40, random_state=101)
    clf5 = AdaBoostClassifier(n_estimators=70, learning_rate=1.5, random_state=33)
    clf6 = GradientBoostingClassifier(n_estimators=250, learning_rate=0.23, random_state=33)
    clf7 = XGBClassifier(n_estimators=100, learning_rate=0.12, min_child_weight=1)
    base_model = [
        ['svc', clf1],
        ['xgbc', clf2],
        ['rfc', clf3],
        ['bgc', clf4],
        ['adbc', clf5],
        ['gdbc', clf6]
    ]
    self.base_models = base_model
    self.XGB = clf7
def init_model(input_data, target_data):
    model = AdaBoostClassifier(n_estimators=285, learning_rate=0.19, algorithm='SAMME.R')
    model.fit(input_data, target_data)
    return model
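A minimal call sketch for init_model; the synthetic arrays below are placeholders for whatever input_data/target_data the caller normally supplies.

# Illustrative only: fit on random binary data and report training accuracy.
import numpy as np

X = np.random.rand(200, 8)
y = np.random.randint(0, 2, size=200)
model = init_model(X, y)
print(model.score(X, y))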
print("LEARNING STEP") #default classifier = "not_init" if alg == 0: classifier = DecisionTreeClassifier(max_depth=tree_depth) if alg == 1: classifier = RandomForestClassifier(n_estimators=random_forest_size, random_state=seed, n_jobs=10) if alg == 2: classifier = create_ensemble(seed) if alg == 3: classifier = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=boosting_size, random_state=seed) if alg == 4: scaler = StandardScaler() svr = SVR(kernel='rbf', cache_size=4000, C=1e3, gamma=0.0001, max_iter=200000, epsilon=0.0001) classifier = Pipeline([('standardize', scaler), ('svr', svr)]) if alg == 5: classifier = GaussianNB() if classifier == "not_init": print("Classifier not init, exit")
class SkClassifier(MultiClassifier):
    type_map = dict(
        MultiClassifier.type_map,
        c=float,
        gamma=float,
        cache=int,
        n_estimators=int,
        n_neighbors=int,
        radius=float,
        # probability=str_to_bool,  # TODO
        # class_weights  # TODO
    )
    base_param_grid = {'svc__C': np.logspace(-2, 3, 5)}
    classifiers = {
        'svm': {
            'build': lambda self: SVC(kernel='linear', C=self.c, probability=self.probability,
                                      cache_size=self.cache,
                                      class_weight=SVM_CLASS_WEIGHTS if SVM_CLASS_WEIGHTS else None),
            'param_grid': dict(base_param_grid),
            'test_params': {'probability': False},
            'roc_params': {'probability': True},
        },
        'svm-rbf': {
            'build': lambda self: SVC(kernel='rbf', C=self.c, probability=self.probability,
                                      cache_size=self.cache, gamma=self.gamma,
                                      class_weight=SVM_CLASS_WEIGHTS if SVM_CLASS_WEIGHTS else None),
            'param_grid': dict(base_param_grid, svc__gamma=np.logspace(-9, 3, 5)),
            'test_params': {'probability': False},
            'roc_params': {'probability': True},
        },
        'mlp': {
            'build': lambda self: MLPClassifier(solver='lbfgs', alpha=1e-5,
                                                hidden_layer_sizes=(5, 2), random_state=42),
            'param_grid': dict(),
        },
        'knn': {
            'build': lambda self: KNeighborsClassifier(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs),
            'param_grid': dict(),
        },
        'rnn': {
            'build': lambda self: RadiusNeighborsClassifier(radius=self.radius, n_jobs=self.n_jobs,
                                                            outlier_label=0),
            'param_grid': dict(),
        },
        'ada': {
            'build': lambda self: AdaBoostClassifier(),
            'param_grid': dict(),
        },
        'ada-svm': {
            'build': lambda self: AdaBoostClassifier(base_estimator=SVC(
                probability=True, kernel='rbf', C=self.c, gamma=self.gamma, cache_size=self.cache)),
            'param_grid': dict(),
        },
        'ada-sgd': {
            'build': lambda self: AdaBoostClassifier(base_estimator=SGDClassifier(loss='hinge'),
                                                     algorithm='SAMME'),
            'param_grid': dict(),
        },
        'rf': {
            'build': lambda self: RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=self.n_jobs),
            'param_grid': dict(),
        },
        'et': {
            'build': lambda self: ExtraTreesClassifier(n_estimators=self.n_estimators, n_jobs=self.n_jobs),
            'param_grid': dict(),
        },
        'gnb': {
            'build': lambda self: GaussianNB(),
            'param_grid': dict(),
        },
        'bnb': {
            'build': lambda self: BernoulliNB(),
            'param_grid': dict(),
        },
    }

    def __init__(self, classifier_name='svm', c=1000, gamma=0.02, cache=2000, n_estimators=200,
                 n_neighbors=16, radius=1.0, probability=True, **kwargs):
        super().__init__(classifier_name, **kwargs)
        logger.info('Initializing Scikit-learn classifier {}'.format(classifier_name))
        self.c = c
        self.gamma = gamma
        self.cache = cache
        self.probability = probability
        self.n_estimators = n_estimators
        self.n_neighbors = n_neighbors
        self.radius = radius

    def load_model(self, model_dir):
        super().load_model(model_dir=model_dir)
        self.clf = joblib.load(os.path.join(model_dir, 'clf.pkl'))

    def save_model(self, output_dir):
        super().save_model(output_dir=output_dir)
        joblib.dump(self.clf, os.path.join(output_dir, 'clf.pkl'))

    def _build_classifier(self, *args, **kwargs):
        return self.classifier_dict['build'](self)

    def _get_param_grid(self):
        return self.classifier_dict['param_grid']

    def _get_test_params(self):
        return self.classifier_dict.get('test_params', {})

    def _get_cv_params(self):
        return {**self._get_test_params(), **self.classifier_dict.get('cv_params', {})}

    def _get_roc_params(self):
        return self.classifier_dict.get('roc_params', {})
from sklearn.svm import SVC
import os
import warnings
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import train_test_split
import shutil
from statistics import mean

warnings.filterwarnings('ignore')

classifiers = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    BernoulliNB(),
    CalibratedClassifierCV(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LabelPropagation(),
    LabelSpreading(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV(),
# NOTE: Adjust Trainingset / Testset division ratio:
divratio = 0.3

# Normalization (L1 & L2):
# NOTE: Change 'normtype' value to 'l1' / 'l2' to change normalization type:
normtype = 'l2'  # 'l1'

# model_selection is used for manually enabling the individual models.
# NOTE: Setting the boolean value enables/disables the model.
model_selection = {
    'ExtraTrees': (True, ExtraTreesClassifier(n_estimators='warn', criterion='gini', max_depth=None,
        min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
        max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False,
        oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None)),
    'RandomForest': (True, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    'AdaBoost': (True, AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0,
        algorithm='SAMME.R', random_state=None)),
    'DecisionTree': (True, DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5,
        min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None,
        random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
        class_weight=None, presort=False)),
    'GradientBoosting': (True, GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
        n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
        min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0,
        max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1,
        n_iter_no_change=None, tol=0.0001)),
    'BernoulliNB': (True, BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)),
    'BaggingClassifier': (True, BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False,
        n_jobs=None, random_state=None, verbose=0)),
    'NearestNeighbors': (True, KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto',
        leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)),  # (n_neighbors=4)
    'LogisticRegressionCV': (True, LogisticRegressionCV(Cs=10, fit_intercept=True, cv='warn', dual=False,
        penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None,
        n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='warn',
        random_state=None, l1_ratios=None)),
    'LDA': (True, LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None,
        n_components=None, store_covariance=False, tol=0.0001)),
    'LogisticRegression': (True, LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
        fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn',
        max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)),
    'CalibratedClassifierCV': (True, CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv='warn')),
    'LinearSVC': (True, LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0,
        multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0,
        random_state=None, max_iter=1000)),
    'LinearSVM': (True, SVC(kernel='linear', C=0.025)),  # (C=0.01, penalty='l1', dual=False)
    'RBF_SVM': (True, SVC(gamma='auto')),  # (gamma=2, C=1)
    # 'Nu_SVM': (True, NuSVC(gamma='auto')),
    'GaussianProcess': (False, GaussianProcessClassifier()),  # (1.0 * RBF(1.0))
    'NeuralNet': (True, MLPClassifier(alpha=1, max_iter=1000)),
from sklearn.tree import DecisionTreeClassifier

# classification models
classifiers = {
    'K-Nearest Neighbors (Braycurtis norm)': KNeighborsClassifier(n_neighbors=3, algorithm='auto',
                                                                  metric='braycurtis'),
    'Random Forest': RandomForestClassifier(n_estimators=80, n_jobs=1),
    'SVM': SVC(gamma=2, C=1),
    'Linear Support Vector Machine': SVC(kernel="linear", C=0.025),
    'Decision Tree': DecisionTreeClassifier(max_depth=5),
    'Ada Boost': AdaBoostClassifier(n_estimators=80, learning_rate=0.4),
    'Naive Bayes': GaussianNB(),
}

vc = VotingClassifier(estimators=list(classifiers.items()), voting='hard')


def evaluate_model(model_name, model, x, y):
    """Evaluate model accuracy via cross validation."""
    print('%s:' % model_name)
    model.fit(x, y.values.ravel())
    print('CV f1_micro (not reusing data): %s' % np.mean(
        cross_val_score(model, x, y.values.ravel(), cv=5, scoring='f1_micro')))


def predict(x, y, signal_matrix, verbose=1):
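A hedged usage sketch for the classifiers dict and evaluate_model defined above; the feature matrix and single-column label DataFrame are invented, chosen to match evaluate_model's use of y.values.ravel().

# Illustrative loop: score every individual model plus the hard-voting ensemble.
import numpy as np
import pandas as pd

x = np.random.rand(120, 4)
y = pd.DataFrame({'label': np.random.randint(0, 2, size=120)})

for name, model in classifiers.items():
    evaluate_model(name, model, x, y)
evaluate_model('Voting (hard)', vc, x, y)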
from sklearn.cluster import SpectralBiclustering, SpectralClustering, SpectralCoclustering
from sklearn.manifold import SpectralEmbedding, TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import TheilSenRegressor
from sklearn.mixture.dpgmm import VBGMM  # only available in older scikit-learn releases
from sklearn.feature_selection import VarianceThreshold
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

clf_dict = {
    'ARDRegression': ARDRegression(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'AdditiveChi2Sampler': AdditiveChi2Sampler(),
    'AffinityPropagation': AffinityPropagation(),
    'AgglomerativeClustering': AgglomerativeClustering(),
    'BaggingClassifier': BaggingClassifier(),
    'BaggingRegressor': BaggingRegressor(),
    'BayesianGaussianMixture': BayesianGaussianMixture(),
    'BayesianRidge': BayesianRidge(),
    'BernoulliNB': BernoulliNB(),
    'BernoulliRBM': BernoulliRBM(),
    'Binarizer': Binarizer(),
    'Birch': Birch(),
    'CCA': CCA(),
    'CalibratedClassifierCV': CalibratedClassifierCV(),
    'DBSCAN': DBSCAN(),
g_train = g.iloc[train_ind, :]
g_test = g.iloc[test_ind, :]

clf = tree.DecisionTreeClassifier(criterion='gini', max_depth=6, min_samples_leaf=3)
####################
clf = RandomForestClassifier(criterion='gini', max_depth=6, min_samples_leaf=3, n_estimators=50)
####################
clf = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini', max_depth=6, min_samples_leaf=3),
                         n_estimators=200, learning_rate=0.1)
####################
clf = neighbors.KNeighborsClassifier(100, weights='uniform')
clf = neighbors.KNeighborsClassifier(100, weights='distance')
####################
clf = GaussianNB()
##############################
t0 = time()
param_grid = {
    'C': [150, 500, 750, 1000],
    'gamma': [0.0005, 0.001, 0.05, .01],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train, y_train)
etclf.fit(x_train, y_train)
# Print Confusion Matrix
metrics.confusion_matrix(etclf.predict(x_test), y_test)

from sklearn.ensemble import RandomForestClassifier

rdclf = RandomForestClassifier(n_estimators=20, max_depth=10)
rdclf.fit(x_train, y_train)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)

from sklearn.ensemble import AdaBoostClassifier

adaclf = AdaBoostClassifier(n_estimators=20)
adaclf.fit(x_train, y_train)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)

metrics.confusion_matrix(etclf.predict(x_test), y_test)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)
# The base random forest model seems to do best here.

import time
# homesite.train_x = homesite.train_x[reduced_range]
# homesite.train_y = homesite.train_y[reduced_range]

C = [256, 512]
for c in C:
    # Creating classifier.
    mean_acc = 0.0
    mean_recall = 0.0
    mean_precision = 0.0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cvs = StratifiedKFold(homesite.train_y, n_folds=5)
    clf = AdaBoostClassifier(n_estimators=c, random_state=0)

    # Train classifier.
    print("\nTraining classifier param %d" % c)
    for i, (train, test) in enumerate(cvs):
        sm = OverSampler(verbose=False, ratio=2.5)
        train_oversampled_x, train_oversampled_train_y = sm.fit_transform(
            homesite.train_x[train], homesite.train_y[train])
        probas_ = clf.fit(train_oversampled_x, train_oversampled_train_y).predict_proba(
            homesite.train_x[test])
        fpr, tpr, thresholds = roc_curve(homesite.train_y[test], probas_[:, 1])
for f in field:
    print("field", f)
    temp = groups[f].median()
    for i in range(0, 100945):
        if isnull(dataset.loc[i, f]):
            condition = dataset.loc[i, '_conds']
            dataset.loc[i, f] = temp[condition]
            print("values: ", dataset.loc[i, f], " ; ", temp[condition])

dataset['_heatindexm'].fillna(dataset['_heatindexm'].median(), inplace=True)
dataset['_hum'].fillna(dataset['_hum'].median(), inplace=True)
dataset['_tempm'].fillna(dataset['_tempm'].median(), inplace=True)
dataset['_vism'].fillna(dataset['_vism'].median(), inplace=True)

dataset = dataset.values
X = dataset[:, 1:len(dataset[0])]
Y = dataset[:, 0]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

for dept in range(5, 8):
    for feats in range(5, 8):
        classifier = AdaBoostClassifier(DecisionTreeClassifier(
            max_depth=dept, max_features=feats, splitter="best", criterion="entropy"),
            learning_rate=1.0)
        classifier.fit(X_train, Y_train)
        print("depth: ", dept, "features: ", feats)
        print("Score", classifier.score(X_train, Y_train))
l_train = l_train.join(pd.get_dummies(l_train['Transmission']))
l_train = l_train.join(pd.get_dummies(l_train['WheelType']))
l_train = l_train.join(pd.get_dummies(l_train['Size']))
l_train = l_train.drop(['Auction', 'Transmission', 'WheelType', 'Size'], axis=1)
l_train = l_train.dropna()

data = l_train.drop('IsBadBuy', axis=1)
target = l_train['IsBadBuy']
x_train, x_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=.3)

# AdaBoost runs the best
model = AdaBoostClassifier()
clf = model.fit(x_train, y_train)
scores = clf.score(x_train, y_train)

print(metrics.classification_report(y_train, clf.predict(x_train)))
print(metrics.classification_report(y_test, clf.predict(x_test)))

y_pred = clf.predict(x_test)
metrics.roc_auc_score(y_train, clf.predict(x_train))
metrics.roc_auc_score(y_test, clf.predict(x_test))

# Create a submission
# submission = pd.DataFrame({'RefId': l_test.RefId, 'prediction': y_pred})
# submission.to_csv('/users/alexandersedgwick/desktop/submission.csv')
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')
        df = pd.read_csv(path, encoding="ISO-8859-1")
        filename = request.form['filename']
        str1 = request.form['feature']
        str2 = request.form['label']
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        X = X.str.lower()

        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        X = pd.Series(texts)
        """

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))
        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial', solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)
        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()
        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf5 = GaussianNB()
        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()
        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)
        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))

        start = time()
        clf10 = RidgeClassifierCV()
        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)
        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))

        start = time()
        clf12 = SGDClassifier(n_jobs=-1)  # NOTE: reported as "XGBC" below, but this is a second SGDClassifier
        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a5:
            pickle.dump(clf5, open(filename + '_model', 'wb'))
        elif max_list == a6:
            pickle.dump(clf6, open(filename + '_model', 'wb'))
        elif max_list == a7:
            pickle.dump(clf7, open(filename + '_model', 'wb'))
        elif max_list == a8:
            pickle.dump(clf8, open(filename + '_model', 'wb'))
        elif max_list == a9:
            pickle.dump(clf9, open(filename + '_model', 'wb'))
        elif max_list == a10:
            pickle.dump(clf10, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))
        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4, ac5=a5, ac6=a6,
                               ac7=a7, ac8=a8, ac9=a9, ac10=a10, ac11=a11, ac12=a12)
# Parameters
n_classes = 3
n_estimators = 30
plot_colors = "bry"
plot_step = 0.02

# Load data
iris = load_iris()

plot_idx = 1
for pair in ([0, 1], [0, 2], [2, 3]):
    for model in (DecisionTreeClassifier(),
                  RandomForestClassifier(n_estimators=n_estimators),
                  ExtraTreesClassifier(n_estimators=n_estimators),
                  AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=n_estimators)):
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target

        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # Standardize
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        X = (X - mean) / std

        # Train
        clf = model.fit(X, y)
temp = groups[f].median()
for i in range(0, 768):
    if (dataset.loc[i, f] == 0) & (dataset.loc[i, 'outcome'] == 0):
        dataset.loc[i, f] = temp[0]
    if (dataset.loc[i, f] == 0) & (dataset.loc[i, 'outcome'] == 1):
        dataset.loc[i, f] = temp[1]

dataset = dataset.values
X = dataset[:, 0:len(dataset[0]) - 1]
Y = dataset[:, (len(dataset[0]) - 1)]

# this is for decision tree
data = [[0, 0, 0, 0, 0]]
df = pd.DataFrame(data, columns=['feats', 'depth', 'split', 'max_leaf', 'acc'])
for feats in range(2, 7):
    for dept in range(2, 6):
        acc = 0
        for split in range(5, 40, 5):
            for leaf in range(7, 10):
                for i in range(20):
                    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
                    classifier = AdaBoostClassifier(DecisionTreeClassifier(
                        max_depth=dept, max_features=feats, min_samples_split=split,
                        splitter="best", criterion="entropy", max_leaf_nodes=leaf),
                        learning_rate=1.0)
                    classifier.fit(X_train, Y_train)
                    res = classifier.score(X_test, Y_test)
                    acc = acc + res
                acc = acc / 20
                print('feats:', feats, 'Depth:', dept, 'split:', split, 'max_leaf', leaf, 'acc:', acc * 100)
                df = df.append({'feats': feats, 'depth': dept, 'split': split, 'max_leaf': leaf, 'acc': acc},
                               ignore_index=True)
df.to_csv('Adaboost_result.csv', sep=',')
from matplotlib import pyplot as plt
from custom_models import LoanPytorchModel

# Pulling in all data from 2007-2014
wayne_all = WayneLoanApprovalLoader(savename='wayneall_indicator', csvfile='wayne_county_2007_2014.tsv')

# We have some data, now let's choose a model and some metrics, before putting them into experiment objects.
lr1 = LogisticRegression()
lr2 = LogisticRegression()
lrb1 = LogisticRegression(class_weight='balanced')
lrb2 = LogisticRegression(class_weight='balanced')
ada1 = AdaBoostClassifier()
ada2 = AdaBoostClassifier()
timemodels = [lr1, lr2]
criterion = accuracy_score  # Thankfully this task has a pretty easy evaluation... you either get it right or wrong

# Getting temporally contiguous cuts of data, putting them into different experiments
data_time1 = wayne_all.get_dates([2007, 2008, 2009, 2010])
expmt_time1 = StratifiedExperiment(timemodels[0], criterion, data_time1[:, :-1], data_time1[:, -1],
                                   test_size=0.8)
data_time2 = wayne_all.get_dates([2011, 2012, 2013, 2014])
# Train and test random forests.
# load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin"
load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.bin"
homesite = Data()
homesite.load_sliptted_data(load_path)
del homesite.test_x  # Deleted to save memory.

clf_ann = NeuralNetwork(path="../homesite_data/ann_weights.bin", lr=0.00005, lamb=0)
train_output_ann = clf_ann.get_hidden_output(homesite.train_x)
validation_output_ann = clf_ann.get_hidden_output(homesite.validation_x)
# train_output_ann = np.hstack((train_output_ann, homesite.train_x))
# validation_output_ann = np.hstack((validation_output_ann, homesite.validation_x))

for c in range(1, 10):
    # Train classifier.
    print("Training classifier.")
    clf = AdaBoostClassifier(n_estimators=1 + 100 * c)
    clf.fit(train_output_ann, homesite.train_y)

    # Test classifier.
    print('Testing classifier.')
    predicted_labels = clf.predict_proba(validation_output_ann)[:, 1]

    # Show final results.
    results = confusion_matrix(homesite.validation_y, np.round(predicted_labels))
    accuracy, precision, recall = compute_performance_metrics(results)
    auc = compute_auc(homesite.validation_y, predicted_labels)
plt.title("Variance VS Components") plt.show() # Selecting the ideal number of components and fitting the data pca = PCA(n_components=35) X = pca.fit_transform(X) ### Training the models ### models = [ ("Gaussian NB", GaussianNB()), ("KNN", KNeighborsClassifier()), ("Decision Tree", DecisionTreeClassifier()), ("Random Forest", RandomForestClassifier()), ("Logistic Regression", LogisticRegression()), ("LDA", LinearDiscriminantAnalysis()), ("AdaBoost", AdaBoostClassifier()), ("QDA", QuadraticDiscriminantAnalysis()), ("Neural Net", MLPClassifier()), ("Gradient Boosting", GradientBoostingClassifier()), ("Extra Trees", ExtraTreesClassifier()), # ("SVM", SVC(kernel="linear")), ("XGBOOST Classifer", XGBClassifier()), ] ## Model comparison ### start = timeit.default_timer() accuracies = [] for name, model in models: # kfold = model_selection.KFold(n_splits=10)
train_data = train_data.dropna()
train_data = preprocess_data(train_data)

X = train_data[['is_1', 'is_2', 'is_3', 'Fare', 'is_male', 'is_female']]
Y = train_data['Survived']
XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.2)

n_estimators = 100
models = [
    DecisionTreeClassifier(max_depth=3),
    BaggingClassifier(n_estimators=n_estimators),
    RandomForestClassifier(n_estimators=n_estimators),
    ExtraTreesClassifier(n_estimators=n_estimators),
    AdaBoostClassifier(n_estimators=n_estimators)
]
model_title = ['DecisionTree', 'Bagging', 'RandomForest', 'ExtraTrees', 'AdaBoost']

surv_preds, surv_probs, scores, fprs, tprs, thres = ([] for i in range(6))
for i, model in enumerate(models):
    print('Fitting {0}'.format(model_title[i]))
    clf = model.fit(XTrain, YTrain)
    surv_preds.append(model.predict(XTest))
    surv_probs.append(model.predict_proba(XTest))
    scores.append(model.score(XTest, YTest))
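The fprs/tprs/thres lists above are declared but not filled in this excerpt; a hedged continuation sketch (my own illustration, assuming binary labels in YTest) shows one way they could be populated with sklearn.metrics.roc_curve.

# Illustrative continuation, not part of the original script.
from sklearn.metrics import roc_curve

for probs in surv_probs:
    fpr, tpr, thresholds = roc_curve(YTest, probs[:, 1])
    fprs.append(fpr)
    tprs.append(tpr)
    thres.append(thresholds)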
    tuned_parameters = [{'n_estimators': [5, 10, 100, 200],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['log2', 'sqrt'],
                         'max_depth': [10, 100]}]
    algo = RandomForestClassifier()

elif choice == 'i' or choice == 'I':
    print("\n**********************************\n")
    print(" \t AdaBoost Classifier")
    tuned_parameters = [{'n_estimators': [5, 10, 50, 100, 200],
                         'learning_rate': [0.1, 0.2, 0.5, 1],
                         'algorithm': ['SAMME', 'SAMME.R'],
                         'random_state': [1, 2, 3, 5]}]
    algo = AdaBoostClassifier()

elif choice == 'j' or choice == 'J':
    print("\n**********************************\n")
    print(" \t Gradient Boosting Classifier")
    tuned_parameters = [{'n_estimators': [5, 10, 50, 100, 200],
                         'learning_rate': [0.1, 0.2, 0.5, 1],
                         'min_impurity_decrease': [0.0001],
                         'max_depth': [10, 100]}]
    algo = GradientBoostingClassifier()

elif choice == 'k' or choice == 'K':
    print("\n**********************************\n")
    print(" \t XG Boost")
    tuned_parameters = [{'n_estimators': [5, 10, 50, 100, 200],
    y, test_size=0.2, random_state=42)

#:# preprocessing
transform_pipeline = Pipeline([('scaler', StandardScaler())])
X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model
params = {'learning_rate': 0.5, 'n_estimators': 300}
classifier = AdaBoostClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# e595f5d5683f3e3692608020cd5bde18
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(transform_pipeline.transform(X_test))[:, 1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'acc: {accuracy_score(y_test, y_pred)}')
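If further audit numbers are wanted from the confusion-matrix counts above, a small follow-on sketch (purely illustrative, with guards against zero denominators) derives precision and recall directly from tn, fp, fn, tp.

# Illustrative continuation of the audit block.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')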
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier

DECISION_TREE = DecisionTreeClassifier()
LOGISTIC_REGRESSION = LogisticRegression()
NAIVE_BAYS = GaussianNB()
K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)

# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()


def getClassifierMap():
    CLASSIFIER_MAP = {
        "DECISION_TREE": DECISION_TREE,
        "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
        "NAIVE_BAYS": NAIVE_BAYS,