class kernelsvm():
    """Approximate RBF-kernel SVM: a Nystroem feature map followed by a
    linear SGD classifier trained on the mapped features.

    theta0      -- RBF gamma passed to the Nystroem map
    alpha       -- regularization strength for SGDClassifier
    loss_metric -- loss name passed to SGDClassifier
    """

    def __init__(self, theta0, alpha, loss_metric):
        self.theta0 = theta0
        self.alpha = alpha
        self.loss_metric = loss_metric

    def fit(self, X, y, idx_SR):
        """Fit the Nystroem map on the landmark rows idx_SR, then fit SGD
        on the transformed features. Number of components equals the
        number of landmark indices.
        """
        self.feature_map_nystroem = General_Nystroem(
            kernel='rbf', gamma=self.theta0, n_components=len(idx_SR))
        X_features = self.feature_map_nystroem.fit_transform(X, idx_SR)
        print("fitting SGD")
        self.clf = SGDClassifier(loss=self.loss_metric, alpha=self.alpha)
        self.clf.fit(X_features, y)
        print("fitting SGD finished")

    def predict(self, X):
        """Return (predictions, transformed features) for raw input X."""
        print("Predicting")
        X_transform = self.feature_map_nystroem.transform(X)
        predictions = self.clf.predict(X_transform)
        return predictions, X_transform

    def decision_function(self, X):
        # X should be the transformed input!
        return self.clf.decision_function(X)

    def err_rate(self, y_true, y_pred):
        """Return the misclassification rate, i.e. 1 - accuracy."""
        return 1.0 - accuracy_score(y_true, y_pred)

    def get_params(self):
        # Expose the underlying SGD classifier's parameters.
        return self.clf.get_params()
def main(input_path, output_path):
    """Train a PCA + logistic-SGD pipeline on the iris CSV and track the
    run with MLflow.

    :param input_path: path to the iris CSV; must contain a "Species" column
    :param output_path: path where the best fitted pipeline is dumped
    """
    client = MlflowClient()
    experiment = client.get_experiment_by_name("iris")
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        # Import dataset
        logging.info(f"reading {input_path}")
        mlflow.log_artifact(input_path)
        iris = pd.read_csv(input_path)
        X = iris.drop("Species", axis=1)
        y = iris.Species

        # Instantiate PCA
        pca = PCA()

        # Instantiate logistic regression trained by SGD.
        # NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn
        # 1.1 and removed in 1.3 — update if the project upgrades sklearn.
        logistic = SGDClassifier(loss='log', penalty='l2', max_iter=100,
                                 tol=1e-3, random_state=0)
        mlflow.log_params(logistic.get_params())

        # Parameters grid to try
        param_grid = {
            'pca__n_components': [2, 3],
            'logistic__alpha': np.logspace(-4, 4, 5),
        }
        mlflow.log_params(param_grid)

        # Define training pipeline
        pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

        # Training.
        # Fix: dropped the `iid=False` argument — `iid` was deprecated in
        # scikit-learn 0.22 and removed in 0.24; the default behavior since
        # 0.22 matches iid=False, so results are unchanged.
        logging.info("beginning training")
        search = GridSearchCV(pipe, param_grid, cv=3, return_train_score=False)
        search.fit(X, y)
        print(f"Best parameter (CV score={search.best_score_}):")
        print(search.best_params_)
        mlflow.log_params(search.best_params_)
        mlflow.log_metric("best_score", search.best_score_)

        # Save best model
        logging.info("saving best model")
        dump(search.best_estimator_, output_path)
        mlflow.log_artifact(output_path)
class PlattScaledSVM(BaseEstimator, ClassifierMixin):
    """Linear SVM (hinge-loss SGD) with Platt scaling: a logistic regression
    is fitted on the SVM's decision-function values to produce calibrated
    probabilities.
    """

    def __init__(self, **svm_kwargs):
        # Extra keyword arguments are forwarded to SGDClassifier.
        self.svm_kwargs = svm_kwargs
        self.svm = SGDClassifier(loss="hinge", **self.svm_kwargs)
        self.lr = LogisticRegression()

    def __repr__(self):
        param_str = ', '.join([
            "{0}={1}".format(k, v)
            for (k, v) in self.svm.get_params().items()
        ])
        return "PlattScaledSVM({})".format(param_str)

    # __str__ had an identical hand-written body; delegate instead.
    __str__ = __repr__

    def fit(self, X, y):
        """Fit the SVM on (X, y), then fit the Platt-scaling logistic
        regression on the SVM's decision-function margins. Returns self.
        """
        self.svm.fit(X, y)
        dists = self.svm.decision_function(X)
        self.lr.fit(dists.reshape(-1, 1), y)
        return self

    def predict(self, X, y=None):
        """Return class predictions from the Platt-scaled model."""
        dists = self.svm.decision_function(X)
        preds = self.lr.predict(dists.reshape(-1, 1))
        # Fix: the original computed `preds` but fell off the end of the
        # function, implicitly returning None.
        return preds

    def predict_proba(self, X, y=None):
        """Return calibrated class probabilities."""
        dists = self.svm.decision_function(X)
        probs = self.lr.predict_proba(dists.reshape(-1, 1))
        return probs

    def get_params(self, deep=True):
        # NOTE(review): returns only the kwargs given at construction, not
        # the full sklearn param dict — kept as-is for caller compatibility.
        return self.svm_kwargs

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            # Fix: the original called self.setattr(...), which does not
            # exist on any of the base classes and raised AttributeError.
            setattr(self, parameter, value)
        return self
def SVM(train_bow_tf_idf, train_labels, bow_test_tf_idf, test_labels):
    """Train a linear classifier (SVM-style) with SGD on tf-idf features
    and report train/test accuracy.

    Returns (fitted model, test predictions).
    """
    model = SGDClassifier(loss='squared_hinge',
                          average=100,
                          penalty='l2',
                          alpha=0.0001,
                          random_state=None,
                          max_iter=100,
                          tol=None,
                          n_jobs=-1)
    model.fit(train_bow_tf_idf, train_labels)

    print()
    print('------- Support Vector Machine (SVM) -------')
    print('Default hyperparameters:')
    print(model.get_params())

    # Accuracy on the training set.
    train_pred = model.predict(train_bow_tf_idf)
    train_acc = (train_pred == train_labels).mean()
    print('SVM train accuracy = {}'.format(train_acc))

    # Accuracy on the held-out test set.
    test_pred = model.predict(bow_test_tf_idf)
    test_acc = (test_pred == test_labels).mean()
    print('SVM test accuracy = {}'.format(test_acc))

    # TODO(review): a GridSearchCV over alpha/loss was previously sketched
    # here but disabled; re-add if hyperparameter tuning is needed.

    return model, test_pred
def get_sgd(x_train, t_train, x_val, t_val, search=False):
    """Fit an SGDClassifier; optionally grid-search hyperparameters first.

    When `search` is False, previously-found best parameters are reused.
    Returns the fitted classifier.
    """
    # Earlier results, kept for reference:
    # {'alpha': 0.1, 'loss': 'hinge', 'penalty': 'l2'}
    # params {'alpha': 0.1, 'loss': 'squared_hinge', 'penalty': 'l2'}
    # sgd validated at (array([0.83904737, 0.77597488, 0.63281863, 0.64005236, 0.78926702]), 0.5431523356769534)
    # SGD tested at (array([0.5211474 , 0.75460637, 0.42106365, 0.84335079, 0.82848168]), 0.5295225644352521)
    if search:
        grid = {
            'alpha': [0.01, 0.06, 0.1, 0.6, 1],
            'loss': ['hinge', 'log', 'squared_hinge', 'modified_huber'],
            'penalty': ['l2'],
        }
        params = param_sel(x_train, t_train, SGDClassifier(max_iter=2000), grid)
    else:
        params = {'alpha': 0.1, 'loss': 'squared_hinge', 'penalty': 'l2'}

    model = SGDClassifier(**params, max_iter=2000)
    model.fit(x_train, t_train)
    print("SGD params:", model.get_params())
    print("SGD validated at", validate(model, x_val, t_val))
    return model
def train_and_predict(train_X, train_y, test_X, test_y, coco, prefix):
    """
    Trains an SVM classifier using the given training dataset.
    Then, makes predictions with the given test dataset and saves a
    confusion-matrix plot under `results_folder`.
    """
    # Train a linear classifier with SGD.
    clf = SGDClassifier(random_state=42, max_iter=1000, tol=1e-3)
    clf.fit(train_X, train_y)
    print(clf.get_params(deep=True))

    # Evaluate on the test set.
    print("Predicting with trained SVM...")
    predictions = clf.predict(test_X)
    accuracy = 100 * np.sum(predictions == test_y) / len(test_y)
    print('Percentage correct: ', accuracy)

    # Plot confusion matrix, labelling axes with COCO category names.
    labels = np.unique(test_y + list(predictions))
    category_names = [cat["name"] for cat in coco.loadCats(labels)]
    build_confusion_matrix(
        clf, test_X, test_y, category_names,
        "{0}/{1}_confusion_matrix.png".format(results_folder, prefix))
# Train/evaluate an SGD logistic model on the fraud data, save the metrics,
# then predict on the Kaggle test transactions.
X_train, X_test, y_train, y_test = train_test_split(
    TempTrain, TrainTransaction['isFraud'], test_size=0.1, random_state=42)

# Set up SGD model
SDGModel = SGDClassifier(loss="log", penalty="l2", max_iter=1000)
SDGModel.fit(X_train, y_train)

# Predict values
PredictedValues = SDGModel.predict(X_test)

# Metrics
print(confusion_matrix(y_test, PredictedValues))
print(classification_report(y_test, PredictedValues))

# Save parameters.
# Fix: use a context manager so the file is closed even if a write raises;
# the original open()/close() pair leaked the handle on error.
with open("Params_V3.txt", "w") as text_file:
    text_file.write("%s\n" % SDGModel.get_params())
    text_file.write("%s\n" % confusion_matrix(y_test, PredictedValues))
    text_file.write("%s\n" % classification_report(y_test, PredictedValues))

# Try with test
TestSet_dev = pd.read_csv(
    zip.ZipFile('Data/test_transaction.csv.zip').open("test_transaction.csv"))
X_test_dev = TestSet_dev[np.concatenate(
    (["C" + str(X) for X in [1, 2, 3, 5, 6, 7, 11, 12]],
     ["D" + str(X) for X in range(1, 16)]))]
X_test_dev.shape
X_test_dev.dropna().shape
# NOTE(review): fillna(inplace=True) on a column-sliced frame can hit
# pandas' SettingWithCopy behavior; `X_test_dev = X_test_dev.fillna(0)`
# would be safer — confirm before changing.
X_test_dev.fillna(value=0, inplace=True)

##################
# Submit predictions
PredictedValues_Dev = SDGModel.predict(X_test_dev)
class RBFSVMLearner(learners.BaseLearner):
    """SGD-trained classifier operating on a precomputed RBF kernel matrix.

    The RBF gamma is derived from the data: 1 / (median squared pairwise
    distance * gamma_frac). With use_linear=True the wrapped SGDClassifier
    is used directly on the raw features instead.
    """

    def __init__(self, loss="hinge", penalty='l2', alpha=1e-9, l1_ratio=0,
                 fit_intercept=True, max_iter=MAX_ITER, tol=None, shuffle=True,
                 verbose=False, epsilon=stochastic_gradient.DEFAULT_EPSILON,
                 n_jobs=1, random_state=None, learning_rate="optimal",
                 eta0=0.0, power_t=0.5, class_weight=None, warm_start=False,
                 average=False, n_iter=2000, gamma_frac=0.1, use_linear=False):
        super().__init__(verbose)
        self._alpha = alpha
        self._gamma_frac = gamma_frac
        self._n_iter = n_iter
        self._use_linear = use_linear
        # Underlying linear learner; receives kernel rows as features when
        # use_linear is False.
        self._learner = SGDClassifier(
            loss=loss, penalty=penalty, alpha=self._alpha, l1_ratio=l1_ratio,
            fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
            shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs,
            average=average, learning_rate=learning_rate, eta0=eta0,
            power_t=power_t, class_weight=class_weight, warm_start=warm_start,
            n_iter=self._n_iter, random_state=random_state)
        # Fitted state, populated by fit().
        self.gamma = None
        self.X_ = None
        self.classes_ = None
        self.kernels_ = None
        self.y_ = None

    def learner(self):
        return self._learner

    def fit(self, training_data, classes):
        """Fit on the RBF kernel matrix of training_data (or directly on the
        features when use_linear). Returns the fitted SGDClassifier in the
        linear case, otherwise self.
        """
        if self._use_linear:
            return self._learner.fit(training_data, classes)

        # Check that training_data, classes
        training_data, classes = check_X_y(training_data, classes)

        # Derive gamma from the median squared pairwise distance, scaled
        # by gamma_frac; free the distance matrix as soon as possible.
        sq_dists = euclidean_distances(training_data, squared=True)
        med = np.median(sq_dists)
        del sq_dists
        self.gamma = 1 / (med * self._gamma_frac)

        # Precompute the kernel matrix and remember the training set so
        # predict() can build kernels against it.
        K = rbf_kernel(training_data, None, self.gamma)
        self.X_ = training_data
        self.classes_ = unique_labels(classes)
        self.kernels_ = K
        self.y_ = classes
        self._learner.fit(self.kernels_, self.y_)

        # Return the classifier
        return self

    def predict(self, data):
        """Predict labels for data via its kernel against the training set."""
        if self._use_linear:
            return self._learner.predict(data)

        # Check is fit had been called
        check_is_fitted(self, ['X_', 'y_', '_learner', 'kernels_'])

        # Input validation
        data = check_array(data)

        new_kernels = rbf_kernel(data, self.X_, self.gamma)
        return self._learner.predict(new_kernels)

    # We pass gamma_frac around
    def get_params(self, deep=True):
        """
        Get the current parameters for the learner. This passes the call
        back to the learner from learner()
        :param deep: If true, fetch deeply
        :return: The parameters
        """
        merged = dict(self._learner.get_params(deep))
        merged.update({
            'gamma_frac': self._gamma_frac,
            'use_linear': self._use_linear,
        })
        return merged

    def set_params(self, **params):
        """
        Set the current parameters for the learner. This passes the call
        back to the learner from learner()
        :param params: The params to set
        :return: self
        """
        # Intercept the two wrapper-level params; forward the rest.
        if 'gamma_frac' in params:
            self._gamma_frac = params.pop('gamma_frac', None)
        if 'use_linear' in params:
            self._use_linear = params.pop('use_linear', None)
        return self._learner.set_params(**params)
class MiniBatchSGD(BaseEstimator):
    """SGDClassifier trained epoch-by-epoch on shuffled mini-batches via
    partial_fit, with optional per-batch progress and held-out reporting.
    """

    def __init__(self, verbose=False, test=None, batch_size=100,
                 *args, **kwargs):
        # Extra args/kwargs are forwarded to SGDClassifier.
        self.classifier = SGDClassifier(*args, **kwargs)
        self.verbose = verbose
        # Optional list of (X, y) evaluation sets reported during training.
        self.test = test
        self.batch_size = batch_size

    def fit(self, X, y):
        """Train for classifier.n_iter epochs of shuffled mini-batches."""
        indices = np.arange(X.shape[0])
        # Ceiling division: one extra batch for the remainder, if any.
        num_batches = int(1. * X.shape[0] / self.batch_size) + (
            (X.shape[0] % self.batch_size) != 0)
        classes = np.unique(y)
        for i in range(self.classifier.n_iter):
            if self.verbose:
                print('epoch {}'.format(i))
            np.random.shuffle(indices)
            for j in range(num_batches):
                X_batch = X[indices[j * self.batch_size:
                                    (j + 1) * self.batch_size]]
                y_batch = y[indices[j * self.batch_size:
                                    (j + 1) * self.batch_size]]
                start = time.time()
                # Fix: train on the current mini-batch. The original passed
                # the full (X, y) here, so every "batch" step was a
                # full-dataset partial_fit and the slices above were unused.
                self.classifier.partial_fit(X_batch, y_batch, classes)
                y_pred = self.classifier.predict(X)
                p, r, f, s = precision_recall_fscore_support(y, y_pred)
                cost = f
                if self.verbose:
                    print("epoch: {} batch: {} cost: {} time: {}".format(
                        i, j, cost, time.time() - start))
                if j % 10 == 0 and self.verbose and self.test is not None:
                    for index in range(len(self.test)):
                        y_pred = self.classifier.predict(self.test[index][0])
                        p, r, f, s = precision_recall_fscore_support(
                            self.test[index][1], y_pred)
                        print("precision: {} recall: {} ".format(p, r))

    def predict(self, X):
        return self.classifier.predict(X)

    def decision_function(self, X):
        return self.classifier.decision_function(X)

    def get_params(self):
        # Merge wrapper-level settings into the classifier's params.
        params = self.classifier.get_params()
        params.update({
            'verbose': self.verbose,
            'test': self.test,
            'batch_size': self.batch_size
        })
        return params
from sklearn.linear_model import SGDClassifier

# Display name for this model in reports/experiment tracking.
linear_name = 'SGDClassifier'
# Hyperparameter grid for tuning; only `tol` is actually varied here.
linear_params_grid = {
    'alpha': [0.0001],
    'loss': ['log'],  # NOTE(review): 'log' was renamed 'log_loss' in sklearn 1.1 — confirm installed version
    # 'loss': ['hinge', 'log'],
    'max_iter': [1000],
    'tol': [0.001, 0.0001, 0.01, 0.1]
}
# Base estimator the grid is applied to; seeded for reproducibility.
linear = SGDClassifier(random_state=42)

if __name__ == "__main__":
    print(linear.get_params())
run_name=f"run_{experiment_name}"): #-------Load data -----------# iris = pd.read_csv(input_data_path) X = iris.drop("Species", axis=1) y = iris.Species #-------Define model and parameters----------# pca = PCA() logistic = SGDClassifier(loss='log', penalty='l2', max_iter=200, tol=1e-3, random_state=0) logistic.get_params() param_grid = { 'pca__n_components': [2], 'logistic__alpha': np.logspace(-2, 1, 2), } mlflow.log_params(param_grid) pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)]) #--------Training ----------# logging.info("beginning training") search = GridSearchCV(pipe, param_grid, cv=2, return_train_score=False) search.fit(X, y) logging.info(f"Best parameter (CV score={search.best_score_}):") best_param_renamed = {