def run_cat(filename, modelname, fileout, embeddings, new_run=True, run_parse=True,
            model_type='logreg', C=10.0, alpha=1.0, cutoff=0.50, n_iter=1):
    """Pull transaction data from the database, categorize it, and persist results.

    Loads (or initializes) a classifier, categorizes the rows via ``cat_df`` and
    writes the labelled frame to *fileout*, then pickles the (possibly updated)
    model back to *modelname*.

    :param filename: accepted for interface parity; NOTE(review): the data is
        read from the database, not from this path -- confirm intentional.
    :param modelname: path of the pickled model to load and/or save.
    :param fileout: destination CSV for the categorized data.
    :param embeddings: word-embedding lookup forwarded to ``cat_df``.
    :param new_run: when True build a fresh model instead of unpickling one.
    :param run_parse: forwarded to ``cat_df`` to toggle the parsing step.
    :param model_type: 'logreg', 'passive-aggressive' or 'naive-bayes'.
    :param C: PassiveAggressive regularization strength.
    :param alpha: SGD regularization strength.
    :param cutoff: confidence threshold forwarded to ``cat_df``.
    :param n_iter: SGD epoch count.
    :raises NameError: if *model_type* is not one of the three known names.
    """
    # Pull relevant data and run parsing and classification.
    data = db.getTransactionData()
    df = pd.DataFrame(data)
    # DEBUG: print(df)
    print(dirs, run_parse)
    if new_run:
        # Initialize a fresh model of the requested type.
        if model_type == 'logreg':
            model = linear_model.SGDClassifier(loss='log', warm_start=True,
                                               n_iter=n_iter, alpha=alpha)
        elif model_type == 'passive-aggressive':
            model = linear_model.PassiveAggressiveClassifier(C=C, warm_start=True)
        elif model_type == 'naive-bayes':
            model = naive_bayes.GaussianNB()
        else:
            raise NameError('model_type must be logreg, passive-aggressive, or naive-bayes')
    else:
        # Load a saved, pre-trained model.
        # BUG FIX: the read handle was never closed; the context manager
        # guarantees closure even if unpickling raises.
        with open(modelname, 'rb') as modelFileLoad:
            model = pickle.load(modelFileLoad)
    #fileCities = dirs.dataDir + 'cities_by_state.pickle'
    #us_cities = pd.read_pickle(fileCities)
    df = cat_df(df, model, embeddings, new_run, run_parse, cutoff=cutoff,
                model_type=model_type)
    df.to_csv(fileout, index=False)
    # Saving logistic regression model from training set 1.
    # BUG FIX: context manager replaces the manual open/close pair so the
    # handle is closed even if pickling raises.
    with open(modelname, 'wb') as modelFileSave:
        pickle.dump(model, modelFileSave)
def model_comparison_classification(k, data):
    # type: (int, tuple) -> dict
    """Benchmark several classifiers on *data* using k-fold cross-validation.

    :param k: How many bins.
    :param data: Data of samples and their labels.
    :return: A dictionary with keys being names of classifiers and values being
        the k bins and their accuracy scores.
    """
    candidates = [
        ('SVM', SVC()),
        ('Passive-Agressive', linear.PassiveAggressiveClassifier()),
        ('Bernoulli', bayes.BernoulliNB()),
        ('Multilayered Perceptron', neural.MLPClassifier()),
    ]
    # Evaluate each candidate once and pair its name with the fold scores.
    return {name: k_fold_cv(k, data, estimator) for name, estimator in candidates}
def get_algorithms():
    """Build the lookup table of candidate classifiers, keyed by short code."""
    ensembles = {
        "ada": ensemble.AdaBoostClassifier(),
        "bc": ensemble.BaggingClassifier(),
        "etc": ensemble.ExtraTreesClassifier(),
        "gbc": ensemble.GradientBoostingClassifier(),
        "rfc": ensemble.RandomForestClassifier(),
    }
    gaussian = {"gpc": gaussian_process.GaussianProcessClassifier()}
    linear = {
        "lr": linear_model.LogisticRegressionCV(),
        "pac": linear_model.PassiveAggressiveClassifier(),
        "rcc": linear_model.RidgeClassifierCV(),
        "sgd": linear_model.SGDClassifier(),
        "per": linear_model.Perceptron(),
    }
    bayes = {
        "bnb": naive_bayes.BernoulliNB(),
        "gnb": naive_bayes.GaussianNB(),
    }
    nearest = {"knn": neighbors.KNeighborsClassifier()}
    svms = {
        "svc": svm.SVC(probability=True),
        "nvc": svm.NuSVC(probability=True),
        "lvc": svm.LinearSVC(),
    }
    trees = {
        "dtc": tree.DecisionTreeClassifier(),
        "ets": tree.ExtraTreeClassifier(),
    }
    discriminant = {
        "lda": discriminant_analysis.LinearDiscriminantAnalysis(),
        "qda": discriminant_analysis.QuadraticDiscriminantAnalysis(),
    }
    # Merge the families in the same key order as before.
    return {**ensembles, **gaussian, **linear, **bayes,
            **nearest, **svms, **trees, **discriminant}
def test_basic(self, single_chunk_classification):
    """Big and plain PassiveAggressive learn identical coefficients on one chunk."""
    features, labels = single_chunk_classification
    big_clf = lm.BigPassiveAggressiveClassifier(classes=[0, 1], random_state=0)
    sk_clf = lm_.PassiveAggressiveClassifier(random_state=0)
    big_clf.fit(features, labels)
    sk_clf.partial_fit(features, labels, classes=[0, 1])
    assert_eq(big_clf.coef_, sk_clf.coef_)
def test_sk_PassiveAggressiveClassifier():
    """Fit a PassiveAggressiveClassifier on iris and upload it with one feature vector."""
    print("Testing sklearn, PassiveAggressiveClassifier...")
    features, labels = iris_data
    clf = linear_model.PassiveAggressiveClassifier()
    clf.fit(features, labels)
    metadata = {'name': "PassiveAggressiveClassifier test"}
    first_row = features[0, :]
    upload(clf, first_row, metadata)
def test_basic(self, single_chunk_classification):
    """Chunked and plain PassiveAggressive produce equal estimators on one chunk."""
    features, labels = single_chunk_classification
    chunked_clf = lm.PartialPassiveAggressiveClassifier(
        classes=[0, 1], random_state=0, max_iter=100, tol=1e-3
    )
    sk_clf = lm_.PassiveAggressiveClassifier(random_state=0, max_iter=100, tol=1e-3)
    chunked_clf.fit(features, labels)
    # Materialize the dask collections before handing them to plain sklearn.
    sk_clf.partial_fit(*dask.compute(features, labels), classes=[0, 1])
    assert_estimator_equal(chunked_clf, sk_clf, exclude=["loss_function_"])
def build_sklearn(self, model_id, model_params):
    """Method that builds models implemented in sklearn.

    The long if/elif ladder is replaced with a dispatch table mapping each
    supported ``model_id`` onto its estimator class; the class is instantiated
    with ``model_params``.  Unknown ids still return ``None``.

    :param model_id: string key such as ``'sklearn_SVC'``.
    :param model_params: keyword arguments forwarded to the estimator constructor.
    :return: a fresh estimator instance, or ``None`` when *model_id* is unknown.
    """
    estimator_classes = {
        # Classifiers
        'sklearn_LogisticRegressionCV': linear_model.LogisticRegressionCV,
        'sklearn_LogisticRegression': linear_model.LogisticRegression,
        'sklearn_MLPClassifier': neural_network.MLPClassifier,
        'sklearn_GaussianNB': naive_bayes.GaussianNB,
        'sklearn_MultinomialNB': naive_bayes.MultinomialNB,
        'sklearn_BernoulliNB': naive_bayes.BernoulliNB,
        'sklearn_RandomForestClassifier': ensemble.RandomForestClassifier,
        'sklearn_SVC': svm.SVC,
        'sklearn_AdaBoostClassifier': ensemble.AdaBoostClassifier,
        'sklearn_SGDClassifier': linear_model.SGDClassifier,
        'sklearn_PassiveAggressiveClassifier': linear_model.PassiveAggressiveClassifier,
        'sklearn_RidgeClassifier': linear_model.RidgeClassifier,
        'sklearn_DummyClassifier': dummy.DummyClassifier,
        'sklearn_KNeighborsClassifier': neighbors.KNeighborsClassifier,
        'sklearn_DecisionTreeClassifier': tree.DecisionTreeClassifier,
        # Regressors
        'sklearn_LinearRegression': linear_model.LinearRegression,
        'sklearn_LassoCV': linear_model.LassoCV,
        'sklearn_RidgeCV': linear_model.RidgeCV,
        'sklearn_Ridge': linear_model.Ridge,
        'sklearn_DummyRegressor': dummy.DummyRegressor,
        'sklearn_RandomForestRegressor': ensemble.RandomForestRegressor,
        'sklearn_GradientBoostingRegressor': ensemble.GradientBoostingRegressor,
        'sklearn_MLPRegressor': neural_network.MLPRegressor,
        'sklearn_KNeighborsRegressor': neighbors.KNeighborsRegressor,
        'sklearn_SVR': svm.SVR,
        'sklearn_SGDRegressor': linear_model.SGDRegressor,
        'sklearn_DecisionTreeRegressor': tree.DecisionTreeRegressor,
    }
    estimator_cls = estimator_classes.get(model_id)
    if estimator_cls is None:
        # Preserve the original contract: unsupported ids yield None.
        return None
    return estimator_cls(**model_params)
def run_cat(filename, modelname, fileout, embeddings, new_run=True, run_parse=True,
            model_type='logreg', C=10.0, alpha=1.0, cutoff=0.50, n_iter=1):
    """Parse and categorize transactions read from *filename*.

    Loads (or initializes) a classifier, categorizes the CSV rows with
    ``cat_df`` and writes the labelled frame to *fileout*, then pickles the
    (possibly updated) model back to *modelname*.

    :param filename: input CSV with raw transaction text and amounts.
    :param modelname: path of the pickled model to load and/or save.
    :param fileout: destination CSV for the categorized data.
    :param embeddings: word-embedding lookup forwarded to ``cat_df``.
    :param new_run: when True build a fresh model instead of unpickling one.
    :param run_parse: forwarded to ``cat_df`` to toggle the parsing step.
    :param model_type: 'logreg', 'passive-aggressive' or 'naive-bayes'.
    :param C: PassiveAggressive regularization strength.
    :param alpha: SGD regularization strength.
    :param cutoff: confidence threshold forwarded to ``cat_df``.
    :param n_iter: SGD epoch count.
    :raises NameError: if *model_type* is not one of the three known names.
    """
    # Pull relevant data and run parsing and classification.
    df = pd.read_csv(filename)
    if (len(df.columns) == 2):
        # make sure columns have the right names
        df.columns = ['raw', 'amount']
    if new_run:
        # Initialize a fresh model of the requested type.
        if model_type == 'logreg':
            model = linear_model.SGDClassifier(loss='log', warm_start=True,
                                               n_iter=n_iter, alpha=alpha)
        elif model_type == 'passive-aggressive':
            model = linear_model.PassiveAggressiveClassifier(C=C, warm_start=True)
        elif model_type == 'naive-bayes':
            model = naive_bayes.GaussianNB()
        else:
            raise NameError(
                'model_type must be logreg, passive-aggressive, or naive-bayes'
            )
    else:
        # Load a saved, pre-trained model.
        # BUG FIX: the read handle was never closed; the context manager
        # guarantees closure even if unpickling raises.
        with open(modelname, 'rb') as modelFileLoad:
            model = pickle.load(modelFileLoad)
    fileCities = dirs.data_dir + 'cities_by_state.pickle'
    us_cities = pd.read_pickle(fileCities)
    df = cat_df(df, model, us_cities, embeddings, new_run, run_parse,
                cutoff=cutoff, model_type=model_type)
    df.to_csv(fileout, index=False)
    # Saving logistic regression model from training set 1.
    # BUG FIX: context manager replaces the manual open/close pair.
    with open(modelname, 'wb') as modelFileSave:
        pickle.dump(model, modelFileSave)
def train_test(x_tr, y_tr, x_te, y_te, name):
    """Grid-search the classifier registered under *name* and score it.

    :param x_tr: training features.
    :param y_tr: training labels.
    :param x_te: test features.
    :param y_te: test labels.
    :param name: key into the ``algorithms`` table; also the attribute name
        looked up on ``CVParameters`` for the grid-search space.
    :return: ``{name: {'test', 'train', 'f1_test', 'f1_train', 'tr_time', ...best params}}``
        or ``{}`` when fitting failed (errors are printed, best-effort).
    """
    algorithms = {
        'ada_boost': ensemble.AdaBoostClassifier(),
        'bagging': ensemble.BaggingClassifier(),
        'extra_trees': ensemble.ExtraTreesClassifier(),
        'random_forest': ensemble.RandomForestClassifier(),
        'logistic_regression': linear_model.LogisticRegression(),
        'passive_aggressive': linear_model.PassiveAggressiveClassifier(),
        'ridge': linear_model.RidgeClassifier(),
        'sgd': linear_model.SGDClassifier(),
        'bernoulli': naive_bayes.BernoulliNB(),
        'gaussian': naive_bayes.GaussianNB(),
        'k_neighbors': neighbors.KNeighborsClassifier(),
        'nearest_centroid': neighbors.NearestCentroid(),
        'mlp': neural_network.MLPClassifier(),
        'linear_svc': svm.LinearSVC(),
        'decision_tree': tree.DecisionTreeClassifier(),
        'extra_tree': tree.ExtraTreeClassifier(),
        'gradient_boosting': ensemble.GradientBoostingClassifier(),
        'hist_gradient_boosting': HistGradientBoostingClassifier()
    }
    res = {}
    try:
        clf = GridSearchCV(algorithms.get(name), getattr(CVParameters, name),
                           cv=2, n_jobs=-1)
        # BUG FIX: time.clock() was removed in Python 3.8; perf_counter() is
        # the recommended monotonic replacement for wall-clock timing.
        start = time.perf_counter()
        clf.fit(x_tr, y_tr)
        tr_time = time.perf_counter() - start
        print(tr_time)
        print(clf.best_params_)
        print(clf.best_score_)
        tr_score = clf.score(x_tr, y_tr)
        score = clf.score(x_te, y_te)
        tr_fscore = f1_score(y_tr, clf.predict(x_tr), average='weighted')
        fscore = f1_score(y_te, clf.predict(x_te), average='weighted')
        print(tr_score, score, tr_fscore, fscore)
        res = {
            name: {
                'test': score,
                'train': tr_score,
                'f1_test': fscore,
                'f1_train': tr_fscore,
                'tr_time': tr_time
            }
        }
        res[name].update(clf.best_params_)
    except Exception as e:
        # Deliberate best-effort: report the failure and return an empty result.
        print(e)
    return res
def sklearn_liner_model_regressions(xTrain, xTest, yTrain, yTest):
    """Fit a battery of sklearn linear models and keep those with low average error.

    Each model is fitted on (xTrain, yTrain), predictions are scored with
    ``calculate_prediction_error``, and models whose train AND test average
    error are below 30 are collected into the returned DataFrame.

    :return: DataFrame of error rows for the models that passed the threshold.
    """
    modelForConsideration: DataFrame = pd.DataFrame()
    # BUG FIX: the original list also invoked solver helpers (enet_path,
    # lars_path, lasso_path, orthogonal_mp, orthogonal_mp_gram,
    # ridge_regression).  Those return arrays -- not estimators -- and the
    # no-argument calls raised TypeError while the list was being built, so
    # the function could never run.  Only estimator instances remain.
    LinerModels = \
        [
            linear_model.ARDRegression(),
            linear_model.BayesianRidge(),
            linear_model.ElasticNet(),
            linear_model.ElasticNetCV(),
            linear_model.HuberRegressor(),
            linear_model.Lars(),
            linear_model.LarsCV(),
            linear_model.Lasso(),
            linear_model.LassoCV(),
            linear_model.LassoLars(),
            linear_model.LassoLarsCV(),
            linear_model.LassoLarsIC(),
            linear_model.LinearRegression(),
            linear_model.MultiTaskLasso(),
            linear_model.MultiTaskElasticNet(),
            linear_model.MultiTaskLassoCV(),
            linear_model.MultiTaskElasticNetCV(),
            linear_model.OrthogonalMatchingPursuit(),
            linear_model.OrthogonalMatchingPursuitCV(),
            linear_model.PassiveAggressiveClassifier(),
            linear_model.PassiveAggressiveRegressor(),
            linear_model.Perceptron(),
            linear_model.RANSACRegressor(),
            linear_model.Ridge(),
            linear_model.RidgeClassifier(),
            linear_model.RidgeClassifierCV(),
            linear_model.RidgeCV(),
            linear_model.SGDClassifier(),
            linear_model.SGDRegressor(),
            linear_model.TheilSenRegressor(),
            # linear_model.LogisticRegression()
            # ,linear_model.LogisticRegressionCV()
        ]
    for model in LinerModels:
        modelName: str = model.__class__.__name__
        try:
            # print(f"Preparing Model {modelName}")
            if modelName == "LogisticRegression":
                model = linear_model.LogisticRegression(random_state=0)
            model.fit(xTrain, yTrain)
            yTrainPredict = model.predict(xTrain)
            yTestPredict = model.predict(xTest)
            errorList = calculate_prediction_error(modelName, yTestPredict, yTest,
                                                   yTrainPredict, yTrain)
            if errorList["Test Average Error"][0] < 30 and errorList[
                    "Train Average Error"][0] < 30:
                try:
                    # BUG FIX: DataFrame.append was removed in pandas 2.0;
                    # pd.concat is the supported replacement.
                    modelForConsideration = pd.concat(
                        [modelForConsideration, errorList])
                except (Exception) as e:
                    print(e)
        except (Exception, ArithmeticError) as e:
            # Best-effort sweep: a model that cannot fit this data is skipped.
            print(f"Error occurred while preparing Model {modelName}")
    return modelForConsideration
def build(self, **kwargs):
    """Build and return the estimator wrapped in the standard pipeline.

    Args:
        **kwargs (key-value arguments): Ignored in this implementation.
            Added for compatibility with
            :func:`mlaut.estimators.nn_estimators.Deep_NN_Classifier`.

    Returns:
        `sklearn pipeline` object: pipeline for transforming the features
        and training a grid-searched PassiveAggressiveClassifier.
    """
    search = GridSearchCV(
        linear_model.PassiveAggressiveClassifier(),
        self._hyperparameters,
        verbose=self._verbose,
        n_jobs=self._n_jobs,
        refit=self._refit,
        cv=self._num_cv_folds,
    )
    return self._create_pipeline(estimator=search)
def ModelSelection(test_data, features, label):
    """Train every candidate classifier on a split of the supplied data and rank them.

    :param test_data: DataFrame containing the feature and label columns.
    :param features: list of feature column names.
    :param label: label column name.
    :return: (comparison DataFrame sorted by score, x_train, x_test, y_train, y_test)
    """
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]
    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)
    # BUG FIX: the original body read a module-level ``train_data`` global and
    # silently ignored the ``test_data`` argument; operate on the parameter.
    x_train, x_test, y_train, y_test = train_test_split(test_data[features],
                                                        test_data[label],
                                                        test_size=0.2)
    row_index = 0
    MLA_predict = test_data[label]
    for alg in MLA:
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        # NOTE(review): this stores a whole prediction array under a single
        # Series label -- looks suspicious; confirm the intended structure.
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1
    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
def all_classifiers():
    """Return one default-configured instance of every candidate classifier."""
    families = [
        # Ensemble Methods
        [ensemble.AdaBoostClassifier(),
         ensemble.BaggingClassifier(),
         ensemble.ExtraTreesClassifier(),
         ensemble.GradientBoostingClassifier(),
         ensemble.RandomForestClassifier()],
        # Gaussian Processes
        [gaussian_process.GaussianProcessClassifier()],
        # GLM
        [linear_model.LogisticRegressionCV(),
         linear_model.PassiveAggressiveClassifier(),
         linear_model.RidgeClassifierCV(),
         linear_model.SGDClassifier(),
         linear_model.Perceptron()],
        # Naive Bayes
        [naive_bayes.BernoulliNB(),
         naive_bayes.GaussianNB()],
        # Nearest Neighbor
        [neighbors.KNeighborsClassifier()],
        # SVM
        [svm.SVC(probability=True),
         svm.NuSVC(probability=True),
         svm.LinearSVC()],
        # Trees
        [tree.DecisionTreeClassifier(),
         tree.ExtraTreeClassifier()],
        # Discriminant Analysis
        [discriminant_analysis.LinearDiscriminantAnalysis(),
         discriminant_analysis.QuadraticDiscriminantAnalysis()],
        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        [XGBClassifier()],
    ]
    # Flatten the family groups into the single list callers expect.
    return [clf for family in families for clf in family]
Methodes = [ #Ensemble Methods ensemble.AdaBoostClassifier(), ensemble.BaggingClassifier(), ensemble.ExtraTreesClassifier(), ensemble.GradientBoostingClassifier(), ensemble.RandomForestClassifier(), #Gaussian Processes #gaussian_process.GaussianProcessClassifier(), #GLM linear_model.LogisticRegressionCV(), linear_model.LogisticRegression(C=1000, random_state=0, solver='liblinear'), linear_model.PassiveAggressiveClassifier(), linear_model.RidgeClassifierCV(), linear_model.SGDClassifier(), linear_model.Perceptron(), #Navies Bayes naive_bayes.BernoulliNB(), #naive_bayes.GaussianNB(), #Nearest Neighbor neighbors.KNeighborsClassifier(), #SVM svm.SVC(probability=True), svm.NuSVC(probability=True), svm.LinearSVC(),
def passive_aggressive_classifiers():
    """Wrap a default PassiveAggressiveClassifier in a one-vs-rest scheme."""
    return OneVsRestClassifier(linear_model.PassiveAggressiveClassifier())
# Hold out 25% of the data for final evaluation.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Shuffle the training rows so order-sensitive learners don't see grouped classes.
# NOTE(review): np.random is unseeded here, so the shuffle is not reproducible
# even though train_test_split is -- confirm whether that is intended.
shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# Fit the scaler on the training data only, then apply it to the test data.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
from sklearn import linear_model
# Train the online passive-aggressive classifier.
clf = linear_model.PassiveAggressiveClassifier(random_state=0)
clf.fit(X_train, y_train)
# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
# NOTE(review): the accuracy scores below are computed but never captured.
cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
# Out-of-fold predictions give an unbiased confusion matrix on the training set.
y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
cm = confusion_matrix(y_train, y_train_pred)
print(cm)
from sklearn.metrics import precision_score, recall_score
print("precision score = {0:.4f}".format(precision_score(y_train, y_train_pred)))
print("recall score = {0:.4f}".format(recall_score(y_train, y_train_pred)))
def get_skl_estimator(self, **default_parameters):
    """Instantiate sklearn's PassiveAggressiveClassifier with the given defaults."""
    estimator_cls = linear_model.PassiveAggressiveClassifier
    return estimator_cls(**default_parameters)
def main():
    """End-to-end Titanic pipeline: load, clean, explore, then train and submit.

    Reads train.csv / test.csv, applies the module's cleaning helpers to both
    frames, prints and plots exploratory summaries, then runs model
    comparison, hyper-parameter optimization and submission-file generation.
    """
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    combine = [train_df, test_df]
    # Clean both frames in place with the shared preprocessing helpers.
    for df in combine:
        df.info()
        standardize_data(df)
        create_columns(df)
        create_bins(df)
        encode_data(df)

    # Define target (Y variable)
    target = ["Survived"]

    # Define features (X variables)
    train_df_x = [
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
        "FamilySize",
        "IsAlone",
        "Title",
    ]

    # Define numerical features (binned and encoded)
    train_df_x_bin = [
        "Pclass",
        "Sex_Code",
        "AgeBin_Code",
        "FareBin_Code",
        "Embarked_Code",
        "FamilySize",
        "IsAlone",
        "Title_Code",
    ]

    # Analyze feature correlation with target
    for x in train_df_x:
        if train_df[x].dtype != "float64":
            print(train_df[[x, target[0]]].groupby(x).mean())

    # Graph individual features by survival
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.histplot(x="Fare", data=train_df, hue="Survived", multiple="stack", ax=axis[0])
    sns.histplot(x="Age", data=train_df, hue="Survived", multiple="stack", ax=axis[1])
    sns.histplot(x="FamilySize", data=train_df, hue="Survived", multiple="stack", ax=axis[2])
    fig, axis = plt.subplots(2, 3, figsize=(16, 12))
    sns.barplot(x="Pclass", y="Survived", data=train_df, ax=axis[0, 0])
    sns.barplot(x="Sex", y="Survived", data=train_df, ax=axis[0, 1])
    sns.barplot(x="Embarked", y="Survived", data=train_df, ax=axis[0, 2])
    sns.barplot(x="IsAlone", y="Survived", data=train_df, ax=axis[1, 0])
    sns.barplot(x="Title", y="Survived", data=train_df, ax=axis[1, 1])

    # Compare class with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="Sex", ax=axis[0])
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="IsAlone", ax=axis[1])
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="Embarked", ax=axis[2])

    # Compare Sex with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="Pclass", ax=axis[0])
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="IsAlone", ax=axis[1])
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="Embarked", ax=axis[2])

    # Correlation heatmap of dataset
    fig, ax = plt.subplots(figsize=(14, 12))
    fig = sns.heatmap(
        train_df.corr(),
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        annot=True,
        ax=ax,
    )

    # Machine Learning Algorithm (MLA) selection and initialization
    mla = [
        linear_model.LogisticRegressionCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(dual=False),
        neighbors.KNeighborsClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        naive_bayes.GaussianNB(),
        naive_bayes.BernoulliNB(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
    ]
    # Rank the candidates, tune the best one, and write the submission file.
    mla_compare = test_models(mla, train_df, train_df_x_bin, target)
    best_estimator = optimize_params(mla, mla_compare, train_df, train_df_x_bin, target)
    generate_submission_csv(test_df, train_df_x_bin, best_estimator)
print(name, '.8 - val score:', val_score) if val_score > best_val_score: print('New best val score!! image_feature_model:', image_features_model_name, 'clf:', name) best_clf = clf best_val_score = val_score best_clf_name = name except Exception as e: print("Exception 2!", e) best_clf_name = None best_clf = None best_val_score = 0.97 validate_score_clf( linear_model.PassiveAggressiveClassifier(max_iter=1100, loss='hinge'), 'linear_model.PassiveAggressiveClassifier-loss-hinge') validate_score_clf( linear_model.PassiveAggressiveClassifier(max_iter=700, loss='hinge'), 'linear_model.PassiveAggressiveClassifier-loss-hinge-700') validate_score_clf( linear_model.PassiveAggressiveClassifier(max_iter=700, loss='hinge', class_weight='balanced'), 'linear_model.PassiveAggressiveClassifier-loss-hinge-700-balanced') validate_score_clf( linear_model.PassiveAggressiveClassifier(max_iter=1100, loss='hinge', class_weight='balanced'), 'linear_model.PassiveAggressiveClassifier-loss-hinge-1100-balanced') validate_score_clf(
def parse_param_and_get_model(param_dict):
    """Map a request dict onto an unfitted sklearn estimator plus grid-search space.

    Python 2 module.  Reads learning_algorithm / cv / mode / api from
    param_dict, builds the matching estimator and a parameter grid ('cheap'
    mode uses a reduced grid), and returns
    (clf, model_name, api, cv, param_dic); on an unknown algorithm it returns
    (0, "none", 0, 0, 0).
    """
    #param_dict = json.loads(j_str)
    model_name = param_dict['learning_algorithm'] # 1: linear_svm; 2: ; 3:
    # SECURITY NOTE(review): eval() on request-supplied text executes arbitrary
    # code if param_dict comes from an untrusted source -- confirm provenance.
    cv = eval(param_dict['cv'])
    mode = param_dict['mode']
    api = param_dict['api']
    print "INFO: Learning Algorithm: ", model_name
    print "INFO: CV = ", cv
    print "INFO: mode = ", mode
    print "INFO: API use: ", api
    ###parse and print print parameters###
    print "INFO: ============ Learning Algorithm and Grid Search Parameters ============="
    if model_name == "linear_svm": ### 1: linearSVM
        if mode == "cheap":
            param_dic = [{'C': [0.0001, 0.01, 1, 100, 10000]}]
        else:
            param_dic = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
        print "INFO: Grid Search Parameters:"
        print "INFO: C = ", param_dic[0]['C']
        print "INFO: ====================1: Linear SVM============="
        clf = svm.LinearSVC()
    elif model_name == "svm": ### 2: SVM with kernel
        if mode == "cheap":
            param_dic = [{'C': [0.01, 1, 100], 'kernel':['rbf','sigmoid'], 'gamma':[0.0, 0.5]}, {'C': [0.01, 1, 100], 'kernel':['linear']}, {'C': [0.01, 1, 100], 'kernel':['poly'], 'gamma':[0.0, 0.5], 'degree':[3]}]
        else:
            param_dic = [{'C': [0.0001, 0.01, 1, 100, 10000], 'kernel':['rbf','sigmoid'], 'gamma':[0.0, 0.5, 1]}, {'C': [0.0001, 0.01, 1, 100, 10000], 'kernel':['linear']}, {'C': [0.0001, 0.01, 1, 100, 10000], 'kernel':['poly'], 'gamma':[0.0, 0.5], 'degree':[2,3]}]
        #param_dic = [{'C': [0.0001, 0.01, 1, 100, 10000], 'kernel':['rbf','sigmoid'], 'gamma':[0.0, 0.5, 1]}, {'C': [0.0001, 0.01, 1, 100, 10000], 'kernel':['linear']}, {'C': [0.0001, 0.01, 1, 100, 10000], 'kernel':['poly'], 'gamma':[0.0, 0.5, 1], 'degree':[2,3]}]
        #param_dic = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['rbf','sigmoid'], 'gamma':[0.0, 0.01, 0.1, 1, 10, 100]}, {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['linear']}, {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel':['poly'], 'gamma':[0.0, 0.01, 0.1, 1, 10, 100], 'degree':[2,3,4]}]
        print "INFO: Grid Search Parameters:"
        # Dump every sub-grid's keys and candidate values.
        for p in range (0, len(param_dic)):
            print "INFO: ",
            for key in param_dic[p]:
                print key, ' = ', param_dic[p][key],
            print ""
        print "INFO: ====================2: SVM with kernel============="
        clf = svm.SVC()
    elif model_name == "nu_svm": ### 3: NuSVC
        if mode == "cheap":
            param_dic = [{'nu': [0.1, 0.3], 'kernel':['rbf','sigmoid'], 'gamma':[0.0, 0.1]}, {'nu': [0.1, 0.3], 'kernel':['linear']}, {'nu': [0.1, 0.3], 'kernel':['poly'], 'gamma':[0.0, 0.1], 'degree':[3]}]
        else:
            param_dic = [{'nu': [0.1, 0.2, 0.3], 'kernel':['rbf','sigmoid'], 'gamma':[0.0, 0.1, 1, 10]}, {'nu': [0.1, 0.2, 0.3], 'kernel':['linear']}, {'nu': [0.1, 0.2, 0.3], 'kernel':['poly'], 'gamma':[0.0, 0.1, 1, 10], 'degree':[2,3]}]
        #param_dic = [{'nu': [0.1, 0.2, 0.3, 0.4], 'kernel':['rbf','sigmoid'], 'gamma':[0.0, 0.1, 1, 10]}, {'nu': [0.1, 0.2, 0.3, 0.4], 'kernel':['linear']}, {'nu': [0.1, 0.2, 0.3, 0.4], 'kernel':['poly'], 'gamma':[0.0, 0.1, 1, 10], 'degree':[2,3]}]
        print "INFO: Grid Search Parameters:"
        for p in range (0, len(param_dic)):
            print "INFO: ",
            for key in param_dic[p]:
                print key, ' = ', param_dic[p][key],
            print ""
        print "INFO: ====================3: NuSVC============="
        clf = svm.NuSVC()
    elif model_name == "logistic_regression": ### 4: Logistic Regression
        if mode == "cheap":
            param_dic = [{'C': [0.0001, 0.01, 1, 100, 10000], 'penalty':['l2']}]
        else:
            param_dic = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'penalty':['l2', 'l1']}]
        print "INFO: Grid Search Parameters:"
        print "INFO: C= ", param_dic[0]['C']
        print "INFO: penalty= ", param_dic[0]['penalty']
        print "INFO: ====================4: Logistic Regression============="
        clf = linear_model.LogisticRegression()
    elif model_name == "passive_aggressive_classifier": ### 6: Passive Aggressive Classifier
        if mode == "cheap":
            param_dic = [{'C': [0.0001, 0.01, 1, 100, 10000]}]
        else:
            param_dic = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
        print "INFO: Grid Search Parameters:"
        print "INFO: C= ", param_dic[0]['C']
        print "INFO: ====================6: Passive Aggressive Classifier============="
        clf = linear_model.PassiveAggressiveClassifier()
    else:
        # Unknown algorithm: keep the sentinel 5-tuple contract.
        print "INFO: Training model selection error: no valid ML model selected!"
        return (0, "none", 0, 0, 0)
    return (clf, model_name, api, cv, param_dic)
def parse_param_and_get_model(param_dict):
    """Build a fully parameterized sklearn estimator from a request dict.

    Python 2 module.  Unlike the grid-search variant, every hyper-parameter
    (C, kernel, gamma, nu, degree, penalty) is read directly from param_dict
    and baked into the estimator.  Returns (clf, model_name), or (0, "none")
    when the algorithm key is missing or unsupported.
    """
    # get model name
    if 'learning_algorithm' in param_dict:
        model_name = param_dict['learning_algorithm']
    else:
        print "ERROR: learning_algorithm not found"
        return (0, "none")
    ###parse and print print parameters###
    print "INFO: ============ Learning Algorithm", model_name, "============="
    # SECURITY NOTE(review): eval() below executes request-supplied text --
    # confirm param_dict is trusted; float(...) alone would parse the numbers.
    if model_name == "linear_svm": ### 1: linearSVM
        C = eval(param_dict['c'])
        C = float(C)
        print "INFO: C = ", C
        print "INFO: ==================== 1: Linear SVM ============="
        clf = svm.LinearSVC(C=C)
    elif model_name == "svm": ### 2: SVM with kernel
        C = eval(param_dict['c'])
        C = float(C)
        kernel_func = param_dict['kernel']
        gamma_val = eval(param_dict['gamma'])
        gamma_val = float(gamma_val)
        print "INFO: C = ", C
        print "INFO: kernel = ", kernel_func
        print "INFO: gamma = ", gamma_val
        # degree only applies to the polynomial kernel.
        if kernel_func == "poly":
            degree_num = eval(param_dict['degree'])
            print "degree = ", degree_num
        print "==================== 2: SVM with kernel ============="
        if kernel_func == "poly":
            clf = svm.SVC(C=C, kernel=kernel_func, gamma=gamma_val, degree=degree_num)
        elif kernel_func == "rbf" or kernel_func == "sigmoid":
            clf = svm.SVC(C=C, kernel=kernel_func, gamma=gamma_val)
        else:
            clf = svm.SVC(C=C, kernel=kernel_func)
    elif model_name == "nu-svm": ### 3: NuSVC
        nu_val = eval(param_dict['nu'])
        nu_val = float(nu_val)
        kernel_func = param_dict['kernel']
        gamma_val = eval(param_dict['gamma'])
        gamma_val = float(gamma_val)
        print "INFO: nu = ", nu_val
        print "INFO: kernel = ", kernel_func
        print "INFO: gamma = ", gamma_val
        if kernel_func == "poly":
            degree_num = eval(param_dict['degree'])
            print "INFO: degree = ", degree_num
        print "INFO: ==================== 3: NuSVC ============="
        if kernel_func == "poly":
            clf = svm.NuSVC(nu=nu_val, kernel=kernel_func, gamma=gamma_val, degree=degree_num)
        elif kernel_func == "rbf" or kernel_func == "sigmoid":
            clf = svm.NuSVC(nu=nu_val, kernel=kernel_func, gamma=gamma_val)
        else:
            clf = svm.NuSVC(nu=nu_val, kernel=kernel_func)
    elif model_name == "logistic_regression": ### 4: linearSVM
        C = eval(param_dict['c'])
        C = float(C)
        regularization = param_dict['regularization']
        print "INFO: C = ", C
        print "INFO: penalty = ", regularization
        print "INFO: ==================== 4: Logistic Regression ============="
        clf = linear_model.LogisticRegression(C=C, penalty=regularization)
    elif model_name == "linear_svm_with_sgd": ### 5: linearSVM with SGD, no para as input
        print "INFO: ==================== 5: Linear SVM with SGD ============="
        clf = linear_model.SGDClassifier()
    elif model_name == "passive_aggressive_classifier": ### 6: Passive Aggressive Classifier
        C = eval(param_dict['c'])
        C = float(C)
        print "INFO: C = ", C
        print "INFO: ==================== 6: Passive Aggressive Classifier ============="
        clf = linear_model.PassiveAggressiveClassifier(C=C)
    elif model_name == "perceptron": ### 7: Perceptron
        print "INFO: ==================== 7: Perceptron ============="
        clf = linear_model.Perceptron()
    else:
        print "ERROR: Training model not supported:", model_name
        return (0, "none")
    return (clf, model_name)
#Machine Learning Algorithm (MLA) Selection and initialization CLF = [ #Ensemble Methods ('ada', ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier())), ('bc', ensemble.BaggingClassifier()), ('etc', ensemble.ExtraTreesClassifier()), ('gbc', ensemble.GradientBoostingClassifier()), ('xgbc', xgb.XGBClassifier(max_depth=3)), # xgb.XGBClassifier()), # ('rfc', ensemble.RandomForestClassifier(n_estimators=50)), #Gaussian Processes ('gpc', gaussian_process.GaussianProcessClassifier()), #GLM - remove linear models, since this is a classifier algorithm ('lr', linear_model.LogisticRegressionCV()), ('pac', linear_model.PassiveAggressiveClassifier()), ('rc', linear_model.RidgeClassifierCV()), ('sgd', linear_model.SGDClassifier()), ('pct', linear_model.Perceptron()), #Navies Bayes ('gnb', naive_bayes.GaussianNB()), #Nearest Neighbor ('knn', neighbors.KNeighborsClassifier(n_neighbors=3)), #SVM ('svc', svm.SVC(probability=True)), ('lsvc', svm.LinearSVC()), #Trees
def label_learner_pa():
    """Build a PassiveAggressive learner from the module-level LOCAL_* settings
    and return it keyed as 'pa' inside a SklearnLabelClassifier wrapper."""
    pa_learner = sk.PassiveAggressiveClassifier(
        C=LOCAL_C,
        n_iter=LOCAL_N_ITER,
        class_weight=LOCAL_CLASS_WEIGHT,
    )
    return Keyed('pa', SklearnLabelClassifier(pa_learner))
df_list=['classifier_name','acc_train','acc_test','loss_train','loss_test'] clf=[linear_model.LogisticRegression(solver='liblinear',multi_class='ovr'), linear_model.LogisticRegressionCV(solver='liblinear',multi_class='ovr'), linear_model.SGDClassifier(max_iter=1000,tol=0.00001), linear_model.RidgeClassifier(),linear_model.RidgeClassifierCV(), LinearDiscriminantAnalysis(),QuadraticDiscriminantAnalysis(), svm.LinearSVC(),svm.SVC(gamma='scale',C=10.0,kernel='poly'), svm.NuSVC(gamma='scale',kernel='poly'), KNeighborsClassifier(),RadiusNeighborsClassifier(radius=30), NearestCentroid(), DecisionTreeClassifier(),ExtraTreeClassifier(),GaussianNB(), BernoulliNB(),MultinomialNB(), BaggingClassifier(),RandomForestClassifier(n_estimators=64), AdaBoostClassifier(),GradientBoostingClassifier(), linear_model.Perceptron(max_iter=1000,tol=0.00001), linear_model.PassiveAggressiveClassifier(max_iter=1000,tol=0.00001), GaussianProcessClassifier(),LabelPropagation(),LabelSpreading()] list3clf=['LogisticRegression','LogisticRegressionCV','SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearDiscriminantAnalysis','QuadraticDiscriminantAnalysis', 'LinearSVC', 'SVC','NuSVC', 'KNeighborsClassifier','RadiusNeighborsClassifier','NearestCentroid', 'DecisionTreeClassifier','ExtraTreeClassifier', 'GaussianNB','BernoulliNB','MultinomialNB', 'BaggingClassifier','RandomForestClassifier', 'AdaBoostClassifier','GradientBoostingClassifier', 'Perceptron','PassiveAggressiveClassifier'] y3clf=[] for i in range(len(list3clf)): y3clf.append(classifier_fit_score(clf[i],list3clf[i],'Digits',
def compare_algorithm(data, target):
    """Fit a zoo of classifiers on one train/cross split and print a ranked table.

    For every model the train/test accuracy, precision, recall and AUC are
    collected into a DataFrame which is sorted by test accuracy and printed.
    Metrics assume a binary target (precision_score/recall_score defaults).
    """
    x_train, x_cross, y_train, y_cross = train_test_split(data, target)
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),
        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.001),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(max_iter=1000, tol=0.001),
        linear_model.Perceptron(max_iter=1000, tol=0.001),
        # Navies Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),
        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        xgb.XGBClassifier()
    ]
    MLA_columns = []
    MLA_compare = pd.DataFrame(columns=MLA_columns)
    row_index = 0
    for alg in MLA:
        predicted = alg.fit(x_train, y_train).predict(x_cross)
        # NOTE(review): roc_curve is fed hard class predictions rather than
        # probabilities/scores, so the resulting AUC is degenerate -- confirm.
        fp, tp, th = roc_curve(y_cross, predicted)
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Train Accuracy'] = round(
            alg.score(x_train, y_train), 4)
        MLA_compare.loc[row_index, 'MLA Test Accuracy'] = round(
            alg.score(x_cross, y_cross), 4)
        MLA_compare.loc[row_index, 'MLA Precission'] = precision_score(
            y_cross, predicted)
        MLA_compare.loc[row_index, 'MLA Recall'] = recall_score(y_cross, predicted)
        MLA_compare.loc[row_index, 'MLA AUC'] = auc(fp, tp)
        row_index = row_index + 1
    # Rank the models by held-out accuracy before printing.
    MLA_compare.sort_values(by=['MLA Test Accuracy'], ascending=False, inplace=True)
    print(MLA_compare)
def parse_para_and_get_model(param_dict):
    # Build an sklearn classifier from a parameter dictionary and return it
    # as (estimator, model_name); returns (0, "none") on an unknown algorithm.
    # NOTE: this function uses Python 2 print-statement syntax.
    # NOTE(review): eval() is applied to 'c', 'gamma', 'nu', and 'degree'
    # values below — unsafe if param_dict ever carries untrusted input;
    # consider float()/int() parsing instead. Flagged, not changed.
    #param_dict = json.loads(ml_opts_jstr)
    model_name = param_dict['learning_algorithm']
    print "INFO: ============Learning Algorithm and Parameters============="
    print "INFO: param_dict=", param_dict
    if model_name == "linear_svm":
        ### 1: linearSVM
        C = eval(param_dict['c'])
        C = float(C)
        print "INFO: Learning Algorithm: ", model_name
        print "INFO: C = ", C
        print "INFO: ====================1: Linear SVM============="
        clf = svm.LinearSVC(C=C)
    elif model_name == "svm":
        ### 2: SVM with kernel
        C = eval(param_dict['c'])
        C = float(C)
        kernel_func = param_dict['kernel']
        # gamma defaults to the string "0.0" when absent — presumably an old
        # sklearn sentinel for "auto"; verify against the sklearn version used.
        gamma_val = "0.0"
        if 'gamma' in param_dict:
            gamma_val = eval(param_dict['gamma'])
            gamma_val = float(gamma_val)
        print "INFO: Learning Algorithm: ", model_name
        print "INFO: C = ", C
        print "INFO: kernel = ", kernel_func
        print "INFO: gamma = ", gamma_val
        if kernel_func == "poly":
            # degree only meaningful (and only read) for the poly kernel
            degree_num = eval(param_dict['degree'])
            print "degree = ", degree_num
        print "INFO: ====================2: SVM with kernel============="
        if kernel_func == "poly":
            clf = svm.SVC(C=C, kernel=kernel_func, gamma=gamma_val,
                          degree=degree_num)
        elif kernel_func == "rbf" or kernel_func == "sigmoid":
            clf = svm.SVC(C=C, kernel=kernel_func, gamma=gamma_val)
        else:
            clf = svm.SVC(C=C, kernel=kernel_func)
    elif model_name == "nu_svm":
        ### 3: NuSVC
        nu_val = eval(param_dict['nu'])
        nu_val = float(nu_val)
        kernel_func = param_dict['kernel']
        # unlike the "svm" branch, gamma is required here (no default)
        gamma_val = eval(param_dict['gamma'])
        gamma_val = float(gamma_val)
        print "INFO: Learning Algorithm: ", model_name
        print "INFO: nu = ", nu_val
        print "INFO: kernel = ", kernel_func
        print "INFO: gamma = ", gamma_val
        if kernel_func == "poly":
            degree_num = eval(param_dict['degree'])
            print "INFO: degree = ", degree_num
        print "INFO: ====================3: NuSVC============="
        if kernel_func == "poly":
            clf = svm.NuSVC(nu=nu_val, kernel=kernel_func, gamma=gamma_val,
                            degree=degree_num)
        elif kernel_func == "rbf" or kernel_func == "sigmoid":
            clf = svm.NuSVC(nu=nu_val, kernel=kernel_func, gamma=gamma_val)
        else:
            clf = svm.NuSVC(nu=nu_val, kernel=kernel_func)
    elif model_name == "logistic_regression":
        ### 4: logistic regression
        C = eval(param_dict['c'])
        C = float(C)
        # penalty from CV, regularization from non-CV training
        # NOTE(review): if neither 'regularization' nor 'penalty' is present,
        # `regularization` is never bound and the print below raises
        # NameError — confirm callers always supply one of the two keys.
        if 'regularization' in param_dict:
            regularization = param_dict['regularization']
        elif 'penalty' in param_dict:
            regularization = param_dict['penalty']
        print "INFO: Learning Algorithm: ", model_name
        print "INFO: C = ", C
        print "INFO: penalty = ", regularization
        print "INFO: ====================4: Logistic Regression============="
        clf = linear_model.LogisticRegression(C=C, penalty=regularization)
    elif model_name == "linear_svm_with_sgd":
        ### 5: linearSVM with SGD, no para as input
        print "INFO: Learning Algorithm: ", model_name
        print "INFO: ====================5: Linear SVM with SGD============="
        clf = linear_model.SGDClassifier()
    elif model_name == "passive_aggressive_classifier":
        ### 6: Passive Aggressive Classifier
        C = eval(param_dict['c'])
        C = float(C)
        print "INFO: Learning Algorithm: ", model_name
        print "INFO: C = ", C
        print "INFO: ====================6: Passive Aggressive Classifier============="
        clf = linear_model.PassiveAggressiveClassifier(C=C)
    elif model_name == "perceptron":
        ### 7: Perceptron
        print "INFO: Learning Algorithm: ", model_name
        print "INFO: ====================7: Perceptron============="
        clf = linear_model.Perceptron()
    else:
        # unknown algorithm: signal failure with a (0, "none") sentinel
        print "INFO: Training model selection error: no valid ML model selected!"
        return (0, "none")
    return (clf, model_name)
# Level 2 Score: clf = linear_model.LogisticRegression(solver='sag', random_state=rnd, verbose=0, n_jobs=-1) model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="classifier", filename = "LogReg", setused=setused) # Level 2 Score: clf = linear_model.RidgeCV(cv = 5) model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "RidgeCV", setused=setused) # Level 2 Score: clf = linear_model.PassiveAggressiveClassifier(n_iter=100, random_state=rnd, verbose=0, n_jobs=-1) model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "PasAggC", setused=setused, tag = "1") # Level 2 Score: clf = linear_model.PassiveAggressiveClassifier(n_iter=100, loss='squared_hinge', random_state=rnd, verbose=0, n_jobs=-1) model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "PasAggC", setused=setused, tag = "2") # Level 2 Score: clf = linear_model.PassiveAggressiveRegressor(n_iter=100, random_state=rnd, verbose=0) model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "PasAggR", setused=setused, tag = "1")
xx, yy = np.dot(R, [xx, yy]) ## skalowanie xx /= max(np.absolute(xx)) yy /= max(np.absolute(yy)) ## przypisanie do X X[row, ::2] = xx X[row, 1::2] = yy ## Rozdzielenie danych do późniejszego liczenia 'accuracy' i 'confusion matrix' X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.1, stratify=y) ## UTWORZENIE OBIEKTU KLASYFIKATORA clf = linear_model.PassiveAggressiveClassifier(C=60.69620253164557, fit_intercept=False, max_iter=10000, n_jobs=-1) ## CROSS-VALIDACJA scores = model_selection.cross_validate(clf, X_train, y_train, return_estimator=True, n_jobs=-1) print('The score array for test scores on each cv split:', scores['test_score']) print('Mean of above:', scores['test_score'].mean()) ## WYBRANIE NAJLEPSZEGO ESTYMATORA I PREDYKCJA DLA WSZYTKICH DANYCH best_clf = scores['estimator'][np.argmax(scores['test_score'])] print('Accuracy on final set:', best_clf.score(X_test, y_test))
# Lightning Linear Regression regression(light_reg.AdaGradRegressor(random_state=RANDOM_SEED)), regression(light_reg.CDRegressor(random_state=RANDOM_SEED)), regression(light_reg.FistaRegressor()), regression(light_reg.SAGARegressor(random_state=RANDOM_SEED)), regression(light_reg.SAGRegressor(random_state=RANDOM_SEED)), regression(light_reg.SDCARegressor(random_state=RANDOM_SEED)), # Sklearn Linear Classifiers classification( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification( linear_model.PassiveAggressiveClassifier( random_state=RANDOM_SEED)), classification(linear_model.Perceptron(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifierCV()), classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification_binary( linear_model.PassiveAggressiveClassifier( random_state=RANDOM_SEED)), classification_binary( linear_model.Perceptron(random_state=RANDOM_SEED)), classification_binary( linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
def label_learner_pa():
    """Return a keyed instance of passive aggressive learner.

    The iteration count comes from the module-level LOCAL_PA_ARGS config.
    NOTE(review): n_iter is the legacy sklearn parameter (renamed max_iter
    in 0.19, removed in 0.21) — confirm the pinned sklearn version.
    """
    clf = sk.PassiveAggressiveClassifier(n_iter=LOCAL_PA_ARGS.iterations)
    wrapped = SklearnLabelClassifier(clf)
    return Keyed('pa', wrapped)