def PassiveAggressive_classify(params, dataset, seed, classify):
    model_name = "PassiveAggressive"
    print(model_name, params, dataset, seed)
    np.random.seed(108)
    start_time = timeit.default_timer()
    train_X, train_y, test_X, test_y = gen_train_test_data(dataset, seed)

    # Build a classifier from the selected parameters. The search space is
    # defined as C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True),
    # so params["C"] arrives in log space and is exponentiated here.
    model = PassiveAggressiveClassifier(C=np.exp(params["C"]), max_iter=1000,
                                        tol=1e-3, random_state=108)

    if classify == "test":
        model.fit(train_X, train_y)
        pred_y = model.predict(test_X)
        # Maximize accuracy (stored under the key 'auc' for compatibility
        # with the rest of the evaluation code)
        auc = accuracy_score(test_y, pred_y)
    if classify == "cv":
        scores = cross_val_score(model, train_X, train_y, cv=cv_train)
        auc = np.mean(scores)

    # Minimize loss
    loss = 1.0 - auc
    end_time = timeit.default_timer()
    print("{}_runtime: {}(s)".format(model_name,
                                     round(end_time - start_time, 2)))
    del model

    # Dictionary with information for evaluation
    return {'auc': auc, 'loss': loss, 'status': STATUS_OK}
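# A minimal sketch of driving the objective above with hyperopt's fmin.
# Assumptions: hyperopt is installed; "my_dataset" and the seed are
# placeholders; gen_train_test_data and cv_train come from the surrounding
# project. The returned dict's 'loss'/'status' keys match hyperopt's contract.
from functools import partial

import numpy as np
from hyperopt import Trials, fmin, hp, tpe

# C is searched in log space, matching np.exp(params["C"]) in the objective.
space = {"C": hp.uniform("C", np.log(1e-5), np.log(10))}

trials = Trials()
best = fmin(
    fn=partial(PassiveAggressive_classify,
               dataset="my_dataset", seed=0, classify="cv"),
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)
print(best)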
class PassiveAggressiveClassifierImpl():

    def __init__(self, C=1.0, fit_intercept=True, max_iter=None, tol=None,
                 early_stopping=False, validation_fraction=0.1,
                 n_iter_no_change=5, shuffle=True, verbose=0, loss='hinge',
                 n_jobs=None, random_state=None, warm_start=False,
                 class_weight='balanced', average=False, n_iter=None):
        self._hyperparams = {
            'C': C,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'shuffle': shuffle,
            'verbose': verbose,
            'loss': loss,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'warm_start': warm_start,
            'class_weight': class_weight,
            'average': average,
            'n_iter': n_iter
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
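# A short usage sketch for the wrapper above. It assumes SKLModel is an alias
# for sklearn's PassiveAggressiveClassifier (as the hyperparameter names
# suggest) and an sklearn version around 0.20 in which the deprecated n_iter
# argument is still accepted; the toy data is made up.
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier as SKLModel

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array([1, 0, 1, 0])

clf = PassiveAggressiveClassifierImpl(C=0.5, max_iter=1000, tol=1e-3)
clf.fit(X, y)
print(clf.predict(X))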
def model_translated_english():
    '''
    The model + pipeline for features extracted from the translated text
    (Spanish to English)
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier(max_iter=250, tol=1e-3)
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),

        # Use FeatureUnion to combine the features from subject and body
        ('union', FeatureUnion(
            transformer_list=[

                # Pipeline bag-of-words model
                ('words', Pipeline([
                    ('selector', ItemSelector(key='text_translated')),
                    ('tfidf', TfidfVectorizer(preprocessor=identity,
                                              tokenizer=identity,
                                              max_df=.2)),
                    ('chi-square', SelectKBest(chi2, 300)),  # 3000)),
                ])),

                # Pipeline for high info words bag-of-words model
                ('text_high', Pipeline([
                    ('selector', ItemSelector(key='text_translated_high')),
                    ('tfidf', TfidfVectorizer(preprocessor=identity,
                                              tokenizer=identity,
                                              max_df=.2)),
                ])),

                ('char_n_grams', Pipeline([
                    ('selector', ItemSelector(key='text_ngram_translated')),
                    ('tfidf', TfidfVectorizer(analyzer='char',
                                              ngram_range=(2, 6))),
                ])),

                ('word_n_grams', Pipeline([
                    ('selector', ItemSelector(key='text_ngram_translated')),
                    ('tfidf', TfidfVectorizer(analyzer='word',
                                              ngram_range=(1, 3))),
                ])),
            ],
        )),

        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])

    return classifier
def demo(output_file=None, instances=40000):
    """ _test_prequential

    This demo shows how to produce a prequential evaluation.

    The first thing needed is a stream. For this case we use a file stream
    which gets its samples from the sea_big.csv file, inside the datasets
    folder.

    Then we need to setup a classifier, which in this case is an instance
    of sklearn's PassiveAggressiveClassifier.

    Then, optionally we create a pipeline structure, initialized on that
    classifier.

    The evaluation is then run.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_big.csv", "CSV", False)
    stream = FileStream(opt, -1, 1)
    # stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000, leaf_size=40,
    #                       categorical_list=None)
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000,
    #                                    leaf_size=30, categorical_list=None))
    classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=200, max_instances=instances,
                                    batch_size=1, n_wait=100, max_time=1000,
                                    output_file=output_file,
                                    task_type='classification', show_plot=True,
                                    plot_options=['kappa', 'kappa_t',
                                                  'performance'])

    # Evaluate
    evaluator.eval(stream=stream, classifier=pipe)
def demo(output_file=None, instances=40000):
    """ _test_prequential

    This demo shows how to produce a prequential evaluation.

    The first thing needed is a stream. For this case we use a file stream
    which gets its samples from the sea_big.csv file.

    Then we need to setup a classifier, which in this case is an instance
    of sklearn's PassiveAggressiveClassifier.

    Then, optionally we create a pipeline structure, initialized on that
    classifier.

    The evaluation is then run.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream (the stream must actually be created here,
    # since it is passed to the evaluator below)
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/"
                        "streaming-datasets/master/sea_big.csv")
    # stream = WaveformGenerator()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNADWINClassifier(n_neighbors=8, max_window_size=2000,
    #                                 leaf_size=40, nominal_attributes=None)
    # classifier = OzaBaggingADWINClassifier(base_estimator=KNNClassifier(
    #     n_neighbors=8, max_window_size=2000, leaf_size=30))
    classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=200, max_samples=instances,
                                    batch_size=1, n_wait=100, max_time=1000,
                                    output_file=output_file, show_plot=True,
                                    metrics=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
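# For readers without scikit-multiflow: a minimal test-then-train
# (prequential) loop written directly against sklearn's partial_fit API.
# The synthetic stream below is a stand-in for sea_big.csv.
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier

rng = np.random.RandomState(42)
X_stream = rng.rand(10000, 3)
y_stream = (X_stream[:, 0] + X_stream[:, 1] > 1.0).astype(int)

clf = PassiveAggressiveClassifier()
classes = np.unique(y_stream)
correct = 0

for i, (x, y) in enumerate(zip(X_stream, y_stream)):
    x = x.reshape(1, -1)
    if i > 0:
        # Test first on the incoming sample ...
        correct += int(clf.predict(x)[0] == y)
    # ... then train on it.
    clf.partial_fit(x, [y], classes=classes)

print("prequential accuracy: %.3f" % (correct / (len(y_stream) - 1)))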
def __init__(self, C=1.0, fit_intercept=True, max_iter=None, tol=None,
             early_stopping=False, validation_fraction=0.1,
             n_iter_no_change=5, shuffle=True, verbose=0, loss='hinge',
             n_jobs=None, random_state=None, warm_start=False,
             class_weight='balanced', average=False, n_iter=None):
    self._hyperparams = {
        'C': C,
        'fit_intercept': fit_intercept,
        'max_iter': max_iter,
        'tol': tol,
        'early_stopping': early_stopping,
        'validation_fraction': validation_fraction,
        'n_iter_no_change': n_iter_no_change,
        'shuffle': shuffle,
        'verbose': verbose,
        'loss': loss,
        'n_jobs': n_jobs,
        'random_state': random_state,
        'warm_start': warm_start,
        'class_weight': class_weight,
        'average': average,
        'n_iter': n_iter}
    self._wrapped_model = SKLModel(**self._hyperparams)
def model_title():
    '''
    The model + pipeline for features extracted from the title
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier()
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),

        # Use FeatureUnion to combine the features from subject and body
        ('union', FeatureUnion(
            transformer_list=[

                # Pipeline for title words
                ('title', Pipeline([
                    ('selector', ItemSelector(key='title')),
                    ('tfidf', TfidfVectorizer(preprocessor=identity,
                                              tokenizer=identity)),
                ])),
            ],

            # weight components in FeatureUnion
            transformer_weights={
                # 'title': .3,
            },
        )),

        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])

    return classifier
'LDA': (True,
        LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None,
                                   n_components=None, store_covariance=False,
                                   tol=0.0001)),
'LogisticRegression': (True,
                       LogisticRegression(penalty='l2', dual=False,
                                          tol=0.0001, C=1.0,
                                          fit_intercept=True,
                                          intercept_scaling=1,
                                          class_weight=None,
                                          random_state=None, solver='warn',
                                          max_iter=100, multi_class='warn',
                                          verbose=0, warm_start=False,
                                          n_jobs=None, l1_ratio=None)),
'CalibratedClassifierCV': (True,
                           CalibratedClassifierCV(base_estimator=None,
                                                  method='sigmoid',
                                                  cv='warn')),
'LinearSVC': (True,
              LinearSVC(penalty='l2', loss='squared_hinge', dual=True,
                        tol=0.0001, C=1.0, multi_class='ovr',
                        fit_intercept=True, intercept_scaling=1,
                        class_weight=None, verbose=0, random_state=None,
                        max_iter=1000)),
'LinearSVM': (True, SVC(kernel='linear', C=0.025)),  # (C=0.01, penalty='l1', dual=False)
'RBF_SVM': (True, SVC(gamma='auto')),  # gamma=2, C=1)
# 'Nu_SVM': (True, NuSVC(gamma='auto')),
'GaussianProcess': (False, GaussianProcessClassifier()),  # (1.0 * RBF(1.0))
'NeuralNet': (True, MLPClassifier(alpha=1, max_iter=1000)),
'QDA': (True, QuadraticDiscriminantAnalysis()),
'NaiveBayes': (True, GaussianNB()),
'RadiusNeighborsClassifier': (True, RadiusNeighborsClassifier()),
'SGDClassifier': (True, SGDClassifier()),
'RidgeClassifierCV': (True, RidgeClassifierCV()),
'RidgeClassifier': (True, RidgeClassifier()),
'PassiveAggressiveClassifier': (True, PassiveAggressiveClassifier()),
'LabelPropagation': (True, LabelPropagation()),
'LabelSpreading': (False, LabelSpreading()),
'MultinomialNB': (True, MultinomialNB()),
'NearestCentroid': (True, NearestCentroid()),
'Perceptron': (True, Perceptron()),
}

# feature_set is used for manually enabling the individual features.
# NOTE: setting the boolean value enables/disables the feature.
feature_set = {
    'backers_count': True,
    'converted_pledged_amount': True,
    'goal': True,
    'country': True,
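# A hedged sketch of how a (flag, estimator) table like the one above is
# typically consumed: fit each enabled model and report its accuracy. The
# function name and the train/test arrays are placeholders, not part of the
# original project.
from sklearn.metrics import accuracy_score

def run_enabled(classifiers, X_train, y_train, X_test, y_test):
    results = {}
    for name, (enabled, model) in classifiers.items():
        if not enabled:
            continue  # disabled models keep their entry but are not fit
        model.fit(X_train, y_train)
        results[name] = accuracy_score(y_test, model.predict(X_test))
    return results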
class PassiveAggressive(ParamSklearnClassificationAlgorithm):
    def __init__(self, C, fit_intercept, n_iter, loss, random_state=None):
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.n_iter = int(n_iter)
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.estimator = PassiveAggressiveClassifier(
                C=self.C, fit_intercept=self.fit_intercept, n_iter=1,
                loss=self.loss, shuffle=True, random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Stochastic Gradient Descent '
                        'Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter("loss",
                                         ["hinge", "squared_hinge"],
                                         default="hinge")
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20)
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1, log=True)
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(C)
        return cs
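# The class above targets an old sklearn where n_iter could be bumped between
# warm-started fit calls. A rough modern equivalent (sklearn >= 0.21) uses
# max_iter together with warm_start; the function name and parameters below
# are illustrative only, not the project's API.
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier

def iterative_fit_modern(X, y, total_iter=20, step=1, random_state=None):
    est = PassiveAggressiveClassifier(max_iter=step, tol=None, shuffle=True,
                                      warm_start=True,
                                      random_state=random_state)
    done = 0
    while done < total_iter:
        est.fit(X, y)          # warm_start=True resumes from current coef_
        done += step
        est.max_iter += step   # allow `step` more epochs on the next call
    return est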
class PassiveAggressive(
    IterativeComponentWithSampleWeight,
    BaseClassificationModel,
):
    def __init__(self, C, fit_intercept, tol, loss, average,
                 random_state=None):
        self.C = C
        self.fit_intercept = fit_intercept
        self.average = average
        self.tol = tol
        self.loss = loss
        self.random_state = random_state
        self.estimator = None
        self.time_limit = None
        self.start_time = time.time()

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # less iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration,
        # so we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter, 1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(
                    X, y,
                    alpha=1.0,
                    C=self.estimator.C,
                    loss="hinge",
                    learning_rate=lr,
                    max_iter=n_iter,
                    classes=None,
                    sample_weight=sample_weight,
                    coef_init=None,
                    intercept_init=None)
            if (self.estimator.max_iter >= 1000
                    or n_iter > self.estimator.n_iter_):
                self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        if optimizer == 'smac':
            C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
            fit_intercept = UnParametrizedHyperparameter("fit_intercept",
                                                         "True")
            loss = CategoricalHyperparameter("loss",
                                             ["hinge", "squared_hinge"],
                                             default_value="hinge")
            tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1,
                                             default_value=1e-4, log=True)
            # Note: Average could also be an Integer if > 1
            average = CategoricalHyperparameter('average',
                                                ['False', 'True'],
                                                default_value='False')

            cs = ConfigurationSpace()
            cs.add_hyperparameters([loss, fit_intercept, tol, C, average])
            return cs
        elif optimizer == 'tpe':
            space = {
                'C': hp.loguniform("pa_C", np.log(1e-5), np.log(10)),
                'fit_intercept': hp.choice('pa_fit_intercept', ["True"]),
                'loss': hp.choice('pr_loss', ["hinge", "squared_hinge"]),
                'tol': hp.loguniform('pr_tol', np.log(1e-5), np.log(1e-1)),
                'average': hp.choice('pr_average', ["False", "True"])
            }
            init_trial = {
                'C': 1,
                'fit_intercept': "True",
                'loss': "hinge",
                'tol': 1e-4,
                'average': "False"
            }
            return space
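# A brief usage sketch for the SMAC branch above: build the space, draw a
# random configuration, and instantiate the component from it. This assumes
# the ConfigSpace package's sample_configuration/get_dictionary API; the seed
# is arbitrary.
cs = PassiveAggressive.get_hyperparameter_search_space(optimizer='smac')
cs.seed(1)
config = cs.sample_configuration()
model = PassiveAggressive(**config.get_dictionary(), random_state=1)
print(config)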
def model_words():
    '''
    The model + pipeline for features extracted from the text
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier(C=1, max_iter=1000, tol=1e-3, n_jobs=-1,
                                    class_weight="balanced"),
        PassiveAggressiveClassifier(C=0.1, max_iter=1500, tol=0.01,
                                    n_jobs=-1, class_weight="balanced",
                                    fit_intercept=False,
                                    loss="squared_hinge"),
        AdaBoostClassifier(n_estimators=200),
        MultinomialNB(),
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),

        # Use FeatureUnion to combine the features from subject and body
        ('union', FeatureUnion(
            transformer_list=[

                ('text_high', Pipeline([
                    ('selector', ItemSelector(key='text_high')),
                    ('tfidf', TfidfVectorizer(preprocessor=identity,
                                              tokenizer=identity,
                                              max_df=.2)),
                ])),

                ('word_n_grams', Pipeline([
                    ('selector', ItemSelector(key='sentence')),
                    ('tfidf', TfidfVectorizer(analyzer='word',
                                              ngram_range=(1, 5))),
                ])),

                ('char_n_grams', Pipeline([
                    ('selector', ItemSelector(key='sentence')),
                    ('tfidf', TfidfVectorizer(analyzer='char',
                                              ngram_range=(2, 5))),
                ])),

                ('sentiment', Pipeline([
                    ('selector', ItemSelector(key='sentiment')),
                    ('tfidf', TfidfVectorizer(analyzer='char')),
                ])),

                ('opinion_towards', Pipeline([
                    ('selector', ItemSelector(key='opinion')),
                ])),

                ('target', Pipeline([
                    ('selector', ItemSelector(key='target')),
                ])),

                #### FEATURES THAT DO NOT WORK ####

                # ('sentiment_cont', Pipeline([
                #     ('selector', ItemSelector(key='sentence')),
                #     ('feature', SentimentContinuous())
                # ])),

                # ('glove', Pipeline([
                #     ('selector', ItemSelector(key='sentence')),
                #     ('tfidf', TfidfEmbeddingVectorizer(glove))
                # ])),

                # ('sentence_length', Pipeline([
                #     ('selector', ItemSelector(key='sentence_length')),
                #     ('scaler', MinMaxScaler())
                # ])),
            ],

            # weight components in FeatureUnion
            transformer_weights={
                'text_high': 1,
                'word_n_grams': .8,
                'char_n_grams': .8,
                'sentiment': .8,
                'opinion_towards': 1,
                'target': 1,
            },
        )),

        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])

    return classifier
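# ItemSelector and identity are project helpers that the pipelines above
# assume. A minimal sketch of what they might look like (the real
# implementations may differ):
from sklearn.base import BaseEstimator, TransformerMixin

def identity(x):
    # TfidfVectorizer is handed pre-tokenized input, so the preprocessor
    # and tokenizer are both no-ops.
    return x

class ItemSelector(BaseEstimator, TransformerMixin):
    """Select one field from a dict-of-lists feature bundle."""

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]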
def model_words():
    '''
    The model + pipeline for features extracted from the text
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier(max_iter=250, tol=1e-3),
        PassiveAggressiveClassifier(C=0.001, class_weight="balanced",
                                    fit_intercept=False,
                                    loss="squared_hinge", max_iter=7500)
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),

        # Use FeatureUnion to combine the features from subject and body
        ('union', FeatureUnion(
            transformer_list=[

                # Pipeline bag-of-words model
                ('words', Pipeline([
                    ('selector', ItemSelector(key='text')),
                    ('tfidf', TfidfVectorizer(preprocessor=identity,
                                              tokenizer=identity,
                                              max_df=.2)),
                    ('chi-square', SelectKBest(chi2, 300)),  # 3000)),
                ])),

                # Pipeline for high info words bag-of-words model
                ('text_high', Pipeline([
                    ('selector', ItemSelector(key='text_high')),
                    ('tfidf', TfidfVectorizer(preprocessor=identity,
                                              tokenizer=identity,
                                              max_df=.2)),
                ])),

                # ('char_n_grams', Pipeline([
                #     ('selector', ItemSelector(key='text_ngram')),
                #     ('tfidf', TfidfVectorizer(analyzer='char',
                #                               ngram_range=(2, 6)))
                # ])),

                ('word_n_grams', Pipeline([
                    ('selector', ItemSelector(key='text_ngram')),
                    ('tfidf', TfidfVectorizer(analyzer='word',
                                              ngram_range=(1, 3))),
                ])),

                # ('sentiment_cont', Pipeline([
                #     ('selector', ItemSelector(key='text_ngram')),
                #     ('feature', SentimentContinuous())
                # ])),
            ],
        )),

        # Use a classifier on the combined features
        ('clf', clfs[3]),
    ])

    return classifier
class PassiveAggressive(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    def __init__(self, C, fit_intercept, tol, loss, average,
                 random_state=None):
        self.C = C
        self.fit_intercept = fit_intercept
        self.average = average
        self.tol = tol
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # less iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration,
        # so we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter, 1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(
                    X, y,
                    alpha=1.0,
                    C=self.estimator.C,
                    loss="hinge",
                    learning_rate=lr,
                    max_iter=n_iter,
                    classes=None,
                    sample_weight=sample_weight,
                    coef_init=None,
                    intercept_init=None
                )
            if (
                self.estimator._max_iter >= 1000
                or n_iter > self.estimator.n_iter_
            ):
                self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        loss = CategoricalHyperparameter(
            "loss", ["hinge", "squared_hinge"], default_value="hinge"
        )
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1,
                                         default_value=1e-4, log=True)
        # Note: Average could also be an Integer if > 1
        average = CategoricalHyperparameter('average', ['False', 'True'],
                                            default_value='False')

        cs = ConfigurationSpace()
        cs.add_hyperparameters([loss, fit_intercept, tol, C, average])
        return cs
def __get_classifier_model(classifier, args):
    """
    Convenience function for obtaining a classification model

    Args:
        classifier(str): A string indicating the name of the classifier
        args: An arguments object

    Returns:
        A classification model based on the given classifier string
    """
    # Make SGD Logistic Regression model the default
    model = SGDClassifier(loss='log', penalty='l2', shuffle=True, n_iter=5,
                          n_jobs=-1, random_state=179)
    if classifier == SVM:
        model = SVC(kernel=args.kernel, class_weight="balanced",
                    cache_size=8096, random_state=17, probability=True)
    elif classifier == ADA_BOOST:
        dt = DecisionTreeClassifier(max_depth=15, criterion='gini',
                                    max_features='auto',
                                    class_weight='balanced', random_state=39)
        model = AdaBoostClassifier(base_estimator=dt, n_estimators=400,
                                   random_state=17)
    elif classifier == RF:
        # Configure the classifier to use all available CPU cores
        model = RandomForestClassifier(class_weight="balanced", n_jobs=-1,
                                       n_estimators=400, random_state=17,
                                       max_features='auto', max_depth=15,
                                       criterion='gini')
    elif classifier == GRADIENT_BOOST:
        model = GradientBoostingClassifier(random_state=17, n_estimators=400,
                                           max_features='auto')
    elif classifier == EXTRA_TREES:
        model = ExtraTreesClassifier(random_state=17, n_estimators=400,
                                     n_jobs=-1, class_weight='balanced',
                                     max_depth=15, max_features='auto',
                                     criterion='gini')
    elif classifier == BAGGING:
        dt = DecisionTreeClassifier(max_depth=15, criterion='gini',
                                    max_features='auto',
                                    class_weight='balanced', random_state=39)
        model = BaggingClassifier(base_estimator=dt, n_estimators=400,
                                  random_state=17, n_jobs=-1,
                                  max_features=0.8, max_samples=0.8,
                                  bootstrap=False)
    elif classifier == PASSIVE_AGGRESSIVE:
        model = PassiveAggressiveClassifier(n_iter=10,
                                            class_weight='balanced',
                                            n_jobs=-1, random_state=41)
    elif classifier == PERCEPTRON:
        model = Perceptron(n_jobs=-1, n_iter=10, penalty='l2',
                           class_weight='balanced', alpha=0.25)
    return model
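# A hedged calling sketch for the factory above. SVM, RF, PASSIVE_AGGRESSIVE,
# etc. are module-level string constants in the surrounding project; the args
# namespace and the training arrays are faked placeholders here.
from types import SimpleNamespace

args = SimpleNamespace(kernel="rbf")
model = __get_classifier_model(SVM, args)
model.fit(X_train, y_train)  # X_train/y_train assumed to exist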
def model_words():
    '''
    The model + pipeline for features extracted from the text
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier()
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),

        # Use FeatureUnion to combine the features from subject and body
        ('union', FeatureUnion(
            # n_jobs=-1,
            transformer_list=[

                # Pipeline bag-of-words model
                ('words', Pipeline([
                    ('selector', ItemSelector(key='text')),
                    ('tfidf', TfidfVectorizer(preprocessor=identity,
                                              tokenizer=identity,
                                              max_df=.2)),
                    ('chi-square', SelectKBest(chi2, 3000)),
                ])),

                # # Pipeline for character features
                # ('chars', Pipeline([
                #     ('selector', ItemSelector(key='char')),
                #     ('tfidf', TfidfVectorizer(analyzer='char',
                #                               preprocessor=identity,
                #                               tokenizer=identity,
                #                               ngram_range=(3, 10))),
                # ])),

                # Pipeline for high info words bag-of-words model
                ('text_high', Pipeline([
                    ('selector', ItemSelector(key='text_high')),
                    ('tfidf', TfidfVectorizer(preprocessor=identity,
                                              tokenizer=identity,
                                              max_df=.2)),
                ])),

                # Pipeline for POS tags
                # ('pos_tag', Pipeline([
                #     ('selector', ItemSelector(key='pos_tag')),
                #     ('tfidf', TfidfVectorizer(preprocessor=identity,
                #                               tokenizer=identity)),
                # ])),

                # Pipeline for named entity tags
                # ('named_ent', Pipeline([
                #     ('selector', ItemSelector(key='named_ent')),
                #     ('tfidf', TfidfVectorizer(preprocessor=identity,
                #                               tokenizer=identity)),
                # ])),
            ],

            # weight components in FeatureUnion
            transformer_weights={
                # 'text': .3,
                # 'chars': .4,
                # 'text_high': .7,
                # 'pos_tag': .1,
            },
        )),

        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])

    return classifier
class PassiveAggressive(AutoSklearnClassificationAlgorithm):
    def __init__(self, C, fit_intercept, n_iter, loss, random_state=None):
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.n_iter = int(n_iter)
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.estimator = PassiveAggressiveClassifier(
                C=self.C, fit_intercept=self.fit_intercept, n_iter=1,
                loss=self.loss, shuffle=True, random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Stochastic Gradient Descent '
                        'Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter("loss",
                                         ["hinge", "squared_hinge"],
                                         default="hinge")
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20,
                                              log=True)
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1, log=True)
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(C)
        return cs
def cargaClassifiers(params, n_classes):
    gamma = params[0][0]
    n_gaussianRF = params[0][1]

    window_size = params[1][0]
    vecinos = params[1][1]
    hoja_size = params[1][2]

    # KNN and GRF_KNN
    clf_1 = KNN(n_neighbors=vecinos, leaf_size=hoja_size,
                max_window_size=window_size)

    clf_2 = GRF_KNN(n_neighbors=vecinos, leaf_size=hoja_size,
                    max_window_size=window_size)
    clf_2.gamma = gamma
    clf_2.n_gaussianRF = n_gaussianRF

    # HoeffdingTree and GRF_HoeffdingTree
    clf_3 = HoeffdingTree()

    clf_4 = GRF_HoeffdingTree()
    clf_4.gamma = gamma
    clf_4.n_gaussianRF = n_gaussianRF

    # HoeffdingAdaptiveTree and GRF_HoeffdingAdaptiveTree
    clf_5 = HAT()

    clf_6 = GRF_HoeffdingAdaptiveTree()
    clf_6.gamma = gamma
    clf_6.n_gaussianRF = n_gaussianRF

    # NaiveBayes and GRF_NaiveBayes
    # clf_7 = NaiveBayes()
    #
    # clf_8 = GRF_NaiveBayes()
    # clf_8.gamma = gamma
    # clf_8.n_gaussianRF = n_gaussianRF

    # GNB and GRF_GNB
    clf_9 = GaussianNB()

    clf_10 = GRF_GaussianNB()
    clf_10.gamma = gamma
    clf_10.n_gaussianRF = n_gaussianRF

    # SGDClassifier and GRF_SGDClassifier
    clf_11 = SGDClassifier(max_iter=1)

    clf_12 = GRF_SGDClassifier(max_iter=1)
    clf_12.gamma = gamma
    clf_12.n_gaussianRF = n_gaussianRF

    # Perceptron and GRF_Perceptron
    clf_13 = SGDClassifier(loss='perceptron', eta0=1,
                           learning_rate='constant', penalty=None, max_iter=1)

    clf_14 = GRF_SGDClassifier(loss='perceptron', eta0=1,
                               learning_rate='constant', penalty=None,
                               max_iter=1)
    clf_14.gamma = gamma
    clf_14.n_gaussianRF = n_gaussianRF

    # PassiveAggressiveClassifier and GRF_PassiveAggressiveClassifier
    clf_15 = PassiveAggressiveClassifier(max_iter=1)

    clf_16 = GRF_PassiveAggressiveClassifier(max_iter=1)
    clf_16.gamma = gamma
    clf_16.n_gaussianRF = n_gaussianRF

    # MLPClassifier and GRF_MLPClassifier
    clf_17 = MLPClassifier(batch_size=1, max_iter=1,
                           hidden_layer_sizes=(100,))

    clf_18 = GRF_MLPClassifier(batch_size=1, max_iter=1,
                               hidden_layer_sizes=(100,))
    clf_18.gamma = gamma
    clf_18.n_gaussianRF = n_gaussianRF

    classifiers = [clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_9, clf_10,
                   clf_11, clf_12, clf_13, clf_14, clf_15, clf_16, clf_17,
                   clf_18]
    classifiers_init = [clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_9,
                        clf_10, clf_11, clf_12, clf_13, clf_14, clf_15,
                        clf_16, clf_17, clf_18]
    # classifiers = [clf_1, clf_2]
    # classifiers_init = [clf_1, clf_2]

    names = []
    for c in range(len(classifiers)):
        classifier = classifiers[c]
        class_name = ''
        if str(classifier)[26:33] == 'GRF_KNN':
            class_name = str(classifier)[26:33]
        elif str(classifier)[22:25] == 'KNN':
            class_name = str(classifier)[22:25]
        elif str(classifier)[34:47] == 'HoeffdingTree':
            class_name = 'HT'
        elif str(classifier)[38:55] == 'GRF_HoeffdingTree':
            class_name = 'GRF_HT'
        elif str(classifier)[43:46] == 'HAT':
            class_name = str(classifier)[43:46]
        elif str(classifier)[47:72] == 'GRF_HoeffdingAdaptiveTree':
            class_name = 'GRF_HAT'
        # elif str(classifier)[31:41] == 'NaiveBayes':
        #     class_name = 'MNB'
        # elif str(classifier)[35:49] == 'GRF_NaiveBayes':
        #     class_name = 'GRF_MNB'
        elif str(classifier)[0:10] == 'GaussianNB':
            class_name = 'GNB'
        elif str(classifier)[0:14] == 'GRF_GaussianNB':
            class_name = 'GRF_GNB'
        elif str(classifier)[0:13] == 'SGDClassifier' and classifier.loss == 'hinge':
            class_name = 'SGD'
        elif str(classifier)[0:17] == 'GRF_SGDClassifier' and classifier.loss == 'hinge':
            class_name = 'GRF_SGD'
        elif str(classifier)[0:13] == 'SGDClassifier' and classifier.loss == 'perceptron':
            class_name = 'Perceptron'
        elif str(classifier)[0:17] == 'GRF_SGDClassifier' and classifier.loss == 'perceptron':
            class_name = 'GRF_Perceptron'
        elif str(classifier)[0:27] == 'PassiveAggressiveClassifier':
            class_name = 'PA'
        elif str(classifier)[0:31] == 'GRF_PassiveAggressiveClassifier':
            class_name = 'GRF_PA'
        elif str(classifier)[0:13] == 'MLPClassifier':
            class_name = 'MLP'
        elif str(classifier)[0:17] == 'GRF_MLPClassifier':
            class_name = 'GRF_MLP'
        # elif str(classifier)[0:9] == 'OnlineGRF':
        #     class_name = str(classifier)[0:9]
        names.append(class_name)

    return classifiers, names, classifiers_init
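# The string-slicing checks above are brittle: they break whenever repr
# formatting changes between library versions. A hedged alternative sketch
# keyed on the class name instead (the helper name is illustrative):
def classifier_short_name(clf):
    name = type(clf).__name__
    table = {
        'KNN': 'KNN', 'GRF_KNN': 'GRF_KNN',
        'HoeffdingTree': 'HT', 'GRF_HoeffdingTree': 'GRF_HT',
        'HAT': 'HAT', 'GRF_HoeffdingAdaptiveTree': 'GRF_HAT',
        'GaussianNB': 'GNB', 'GRF_GaussianNB': 'GRF_GNB',
        'PassiveAggressiveClassifier': 'PA',
        'GRF_PassiveAggressiveClassifier': 'GRF_PA',
        'MLPClassifier': 'MLP', 'GRF_MLPClassifier': 'GRF_MLP',
    }
    if name in ('SGDClassifier', 'GRF_SGDClassifier'):
        # SGD doubles as a perceptron depending on its loss setting
        base = 'Perceptron' if clf.loss == 'perceptron' else 'SGD'
        return ('GRF_' + base) if name.startswith('GRF_') else base
    return table.get(name, name)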
class DocSearch(object):
    """
    Index a set of documents. Can provide:
        * documents that match a list of keywords
        * suggestions for user input
        * instances of documents
    """

    INDEX_STEP_LOADING = "loading"
    INDEX_STEP_CLEANING = "cleaning"
    INDEX_STEP_CHECKING = "checking"
    INDEX_STEP_READING = "checking"
    INDEX_STEP_COMMIT = "commit"
    LABEL_STEP_UPDATING = "label updating"
    LABEL_STEP_DESTROYING = "label deletion"

    WHOOSH_SCHEMA = whoosh.fields.Schema(
        # static up to date schema
        docid=whoosh.fields.ID(stored=True, unique=True),
        doctype=whoosh.fields.ID(stored=True, unique=False),
        docfilehash=whoosh.fields.ID(stored=True),
        content=whoosh.fields.TEXT(spelling=True),
        label=whoosh.fields.KEYWORD(stored=True, commas=True,
                                    spelling=True, scorable=True),
        date=whoosh.fields.DATETIME(stored=True),
        last_read=whoosh.fields.DATETIME(stored=True),
    )

    LABEL_ESTIMATOR_TEMPLATE = PassiveAggressiveClassifier(n_iter=50)

    """
    label_estimators is a dict with one estimator per label. Each label is
    predicted with its own estimator (OneVsAll strategy). We cannot use
    sklearn's OneVsRestClassifier directly because it doesn't support
    online learning (partial_fit).
    """
    label_estimators = {}

    def __init__(self, rootdir, callback=dummy_progress_cb):
        """
        Index files in rootdir (see constructor)

        Arguments:
            callback --- called during the indexation (may be called
                *often*).
                step : DocSearch.INDEX_STEP_READING or
                    DocSearch.INDEX_STEP_SORTING
                progression : how many elements done yet
                total : number of elements to do
                document (only if step == DocSearch.INDEX_STEP_READING):
                    file being read
        """
        self.rootdir = rootdir
        base_indexdir = os.getenv("XDG_DATA_HOME",
                                  os.path.expanduser("~/.local/share"))
        self.indexdir = os.path.join(base_indexdir, "paperwork", "index")
        mkdir_p(self.indexdir)

        self.__docs_by_id = {}  # docid --> doc
        self.label_list = []

        need_index_rewrite = True
        try:
            logger.info("Opening index dir '%s' ..." % self.indexdir)
            self.index = whoosh.index.open_dir(self.indexdir)
            # Check that the schema is up-to-date.
            # We use the string representation of the schemas, because
            # previous versions of whoosh don't always implement __eq__.
            if str(self.index.schema) == str(self.WHOOSH_SCHEMA):
                need_index_rewrite = False
        except whoosh.index.EmptyIndexError as exc:
            logger.warning("Failed to open index '%s'" % self.indexdir)
            logger.warning("Exception was: %s" % str(exc))

        if need_index_rewrite:
            logger.info("Creating a new index")
            self.index = whoosh.index.create_in(self.indexdir,
                                                self.WHOOSH_SCHEMA)
            logger.info("Index '%s' created" % self.indexdir)

        self.__searcher = self.index.searcher()

        class CustomFuzzy(whoosh.qparser.query.FuzzyTerm):
            def __init__(self, fieldname, text, boost=1.0, maxdist=1,
                         prefixlength=0, constantscore=True):
                whoosh.qparser.query.FuzzyTerm.__init__(
                    self, fieldname, text, boost, maxdist,
                    prefixlength, constantscore=True
                )

        facets = [whoosh.sorting.ScoreFacet(),
                  whoosh.sorting.FieldFacet("date", reverse=True)]

        self.search_param_list = {
            'full': [
                {
                    "query_parser": whoosh.qparser.MultifieldParser(
                        ["label", "content"], schema=self.index.schema,
                        termclass=CustomFuzzy),
                    "sortedby": facets
                },
                {
                    "query_parser": whoosh.qparser.MultifieldParser(
                        ["label", "content"], schema=self.index.schema,
                        termclass=whoosh.qparser.query.Prefix),
                    "sortedby": facets
                },
            ],
            'fast': [
                {
                    "query_parser": whoosh.qparser.MultifieldParser(
                        ["label", "content"], schema=self.index.schema,
                        termclass=whoosh.query.Term),
                    "sortedby": facets
                },
            ],
        }

        self.check_workdir()
        self.cleanup_rootdir(callback)
        self.reload_index(callback)

        self.label_estimators_dir = os.path.join(
            base_indexdir, "paperwork", "label_estimators")
        self.label_estimators_file = os.path.join(
            self.label_estimators_dir, "label_estimators.jbl")
        try:
            logger.info("Opening label_estimators file '%s' ..."
                        % self.label_estimators_file)
            (l_estimators, ver) = joblib.load(self.label_estimators_file)
            if ver != BasicDoc.FEATURES_VER:
                logger.info("Estimator version is not up to date")
                self.label_estimators = {}
            else:
                self.label_estimators = l_estimators
            # Check that the label_estimators are up to date for their class
            for label_name in self.label_estimators:
                params = self.label_estimators[label_name].get_params()
                if params != self.LABEL_ESTIMATOR_TEMPLATE.get_params():
                    raise IndexError('label_estimators params are not up to'
                                     ' date')
        except Exception as exc:
            logger.error(("Failed to open label_estimator file '%s', or bad"
                          " label_estimator structure") % self.indexdir)
            logger.error("Exception was: %s" % exc)
            logger.info("Will create new label_estimators")
            self.label_estimators = {}
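# A small sketch of the persistence convention used above: estimators are
# stored together with a feature-format version so stale models can be
# discarded on load. The paths and the version constant are illustrative,
# not the project's actual values.
import joblib

FEATURES_VER = 3  # bump whenever the feature extraction changes

def save_estimators(path, estimators):
    joblib.dump((estimators, FEATURES_VER), path)

def load_estimators(path):
    estimators, ver = joblib.load(path)
    if ver != FEATURES_VER:
        return {}  # out-of-date estimators: retrain from scratch
    return estimators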
if ",GNB," in Functions: models.append(('GNB', GaussianNB())) if ",QDA," in Functions: models.append(('QDA', QuadraticDiscriminantAnalysis())) if ",GBC," in Functions: models.append(('GBC', GradientBoostingClassifier())) if ",ETC," in Functions: models.append(('ETC', ExtraTreeClassifier())) if ",BC," in Functions: models.append(('BC', BaggingClassifier())) if ",SGDC," in Functions: models.append(('SGDC', SGDClassifier())) if ",RC," in Functions: models.append(('RC', RidgeClassifier())) if ",PAC," in Functions: models.append(('PAC', PassiveAggressiveClassifier())) if ",ETSC," in Functions: models.append(('ETSC', ExtraTreesClassifier())) if ",BNB," in Functions: models.append(('BNB', BernoulliNB())) if ",GM," in Functions: models.append(('GM', GaussianMixture())) from sklearn.model_selection import KFold from collections import Counter Predictii = [[] for _ in range(len(Y_Test))] Accs = [] normlist = []
class PassiveAggressive(AutoSklearnClassificationAlgorithm):
    def __init__(self, C, fit_intercept, tol, loss, average,
                 random_state=None):
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.tol = float(tol)
        self.loss = loss
        self.average = average == 'True'
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        n_iter = 2
        self.iterative_fit(X, y, n_iter=n_iter, refit=True)
        while not self.configuration_fully_fitted():
            n_iter *= 2
            self.iterative_fit(X, y, n_iter=n_iter)
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # less iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration,
        # so we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter, 1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(
                    X, y,
                    alpha=1.0,
                    C=self.estimator.C,
                    loss="hinge",
                    learning_rate=lr,
                    max_iter=n_iter,
                    classes=None,
                    sample_weight=None,
                    coef_init=None,
                    intercept_init=None)
            if (self.estimator._max_iter >= 1000
                    or n_iter > self.estimator.n_iter_):
                self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        loss = CategoricalHyperparameter("loss", ["hinge", "squared_hinge"],
                                         default_value="hinge")
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1,
                                         default_value=1e-4, log=True)
        # Use string categories so that __init__'s `average == 'True'`
        # comparison works (boolean categories would always yield False).
        average = CategoricalHyperparameter('average', ['False', 'True'],
                                            default_value='False')

        cs = ConfigurationSpace()
        cs.add_hyperparameters([loss, fit_intercept, tol, C, average])
        return cs
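# predict_proba above maps decision_function margins through a softmax. A
# self-contained sketch of that helper, assuming a 2-D array of per-class
# scores (binary outputs are stacked into two columns first):
import numpy as np

def softmax(df):
    if df.ndim == 1:
        df = np.column_stack([-df, df])      # binary case: one margin column
    df = df - df.max(axis=1, keepdims=True)  # shift for numerical stability
    e = np.exp(df)
    return e / e.sum(axis=1, keepdims=True)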
class PassiveAggressive(AutoSklearnClassificationAlgorithm):
    def __init__(self, C, fit_intercept, n_iter, loss, random_state=None):
        super(PassiveAggressive, self).__init__()
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.n_iter = int(n_iter)
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self._iterations = 0
            self.estimator = PassiveAggressiveClassifier(
                C=self.C, fit_intercept=self.fit_intercept, n_iter=1,
                loss=self.loss, shuffle=True, random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.n_iter = self.n_iter
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            # In the first iteration, there is not yet an intercept
            self.estimator.n_iter = n_iter
            self.estimator.partial_fit(X, y, classes=np.unique(y))
            if self._iterations >= self.n_iter:
                self.fully_fit_ = True
            self._iterations += n_iter

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter("loss",
                                         ["hinge", "squared_hinge"],
                                         default="hinge")
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20,
                                              log=True)
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1, log=True)
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(C)
        return cs
class PassiveAggressive:
    def __init__(self, C, fit_intercept, tol, loss, average,
                 random_state=None):
        self.C = C
        self.fit_intercept = fit_intercept
        self.average = average
        self.tol = tol
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X, y, n_iter=2, refit=True,
                           sample_weight=sample_weight)
        iteration = 2
        while not self.configuration_fully_fitted():
            n_iter = int(2 ** iteration / 2)
            self.iterative_fit(X, y, n_iter=n_iter,
                               sample_weight=sample_weight)
            iteration += 1
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # less iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration,
        # so we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter, 1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(
                    X, y,
                    alpha=1.0,
                    C=self.estimator.C,
                    loss="hinge",
                    learning_rate=lr,
                    max_iter=n_iter,
                    classes=None,
                    sample_weight=sample_weight,
                    coef_init=None,
                    intercept_init=None)
            if (self.estimator._max_iter >= 1000
                    or n_iter > self.estimator.n_iter_):
                self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        df = self.estimator.decision_function(X)
        return softmax(df)
'NMF': NMF(),
'NearestCentroid': NearestCentroid(),
'NearestNeighbors': NearestNeighbors(),
'Normalizer': Normalizer(),
'NuSVC': NuSVC(),
'NuSVR': NuSVR(),
'Nystroem': Nystroem(),
'OAS': OAS(),
'OneClassSVM': OneClassSVM(),
'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
'PCA': PCA(),
'PLSCanonical': PLSCanonical(),
'PLSRegression': PLSRegression(),
'PLSSVD': PLSSVD(),
'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
'Perceptron': Perceptron(),
'ProjectedGradientNMF': ProjectedGradientNMF(),
'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
'RANSACRegressor': RANSACRegressor(),
'RBFSampler': RBFSampler(),
'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
'RandomForestClassifier': RandomForestClassifier(),
'RandomForestRegressor': RandomForestRegressor(),
'RandomizedLasso': RandomizedLasso(),
'RandomizedLogisticRegression': RandomizedLogisticRegression(),
'RandomizedPCA': RandomizedPCA(),
'Ridge': Ridge(),
'RidgeCV': RidgeCV(),