Example #1
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self._iterations = 0

            self.estimator = PassiveAggressiveClassifier(
                C=self.C, fit_intercept=self.fit_intercept, n_iter=1,
                loss=self.loss, shuffle=True, random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.n_iter = self.n_iter
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            # In the first iteration, there is not yet an intercept

            self.estimator.n_iter = n_iter
            self.estimator.partial_fit(X, y, classes=np.unique(y))
            if self._iterations >= self.n_iter:
                self.fully_fit_ = True
            self._iterations += n_iter

        return self
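For context, iterative_fit components like this are driven by a loop that keeps calling them until fully_fit_ is set (see the fit() in Example #14). Note also that sklearn.linear_model.passive_aggressive is a private module path that modern scikit-learn removed; the public import is sklearn.linear_model. A minimal, hypothetical driver sketch:

# Hypothetical driver for the iterative_fit() method above;
# `component` is assumed to be an instance of the surrounding class.
def fit_until_done(component, X, y, max_rounds=1000):
    for _ in range(max_rounds):
        component.iterative_fit(X, y, n_iter=1)
        if getattr(component, 'fully_fit_', False):
            break
    return component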
Example #2
def PassiveAggressive_classify(params, dataset, seed, classify):
    model_name = "PassiveAggressive"
    print(model_name, params, dataset, seed)
    np.random.seed(108)
    start_time = timeit.default_timer()
    train_X, train_y, test_X, test_y = gen_train_test_data(dataset, seed)
    # build a classifier based on selected parameters
    # C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
    model = PassiveAggressiveClassifier(C=np.exp(params["C"]),
                                        max_iter=1000,
                                        tol=1e-3,
                                        random_state=108)
    if classify == "test":
        model.fit(train_X, train_y)
        pred_y = model.predict(test_X)
        # maximize accuracy (the variable is named auc but holds accuracy)
        auc = accuracy_score(test_y, pred_y)
    elif classify == "cv":
        scores = cross_val_score(model, train_X, train_y, cv=cv_train)
        auc = np.mean(scores)
    # minimize loss
    loss = 1.0 - auc
    end_time = timeit.default_timer()
    print("{}_runtime: {}(s)".format(model_name, round(end_time - start_time,
                                                       2)))
    del model

    # dictionary with information for evaluation
    return {'auc': auc, 'loss': loss, 'status': STATUS_OK}
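The STATUS_OK return value indicates this objective function targets hyperopt (cv_train is an undefined global in the snippet). A minimal sketch of wiring it into fmin, assuming hyperopt is installed and that dataset/seed/classify are fixed with a closure:

# Hypothetical hyperopt driver for PassiveAggressive_classify above.
from functools import partial
import numpy as np
from hyperopt import fmin, tpe, hp, Trials

# hp.uniform over log-space, because the objective calls np.exp(params["C"])
space = {"C": hp.uniform("C", np.log(1e-5), np.log(10))}
objective = partial(PassiveAggressive_classify,
                    dataset="some_dataset", seed=0, classify="test")
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=50, trials=Trials())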
Example #4
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
Example #5
class PassiveAggressiveClassifierImpl():
    def __init__(self,
                 C=1.0,
                 fit_intercept=True,
                 max_iter=None,
                 tol=None,
                 early_stopping=False,
                 validation_fraction=0.1,
                 n_iter_no_change=5,
                 shuffle=True,
                 verbose=0,
                 loss='hinge',
                 n_jobs=None,
                 random_state=None,
                 warm_start=False,
                 class_weight='balanced',
                 average=False,
                 n_iter=None):
        self._hyperparams = {
            'C': C,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'shuffle': shuffle,
            'verbose': verbose,
            'loss': loss,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'warm_start': warm_start,
            'class_weight': class_weight,
            'average': average,
            'n_iter': n_iter
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
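A short usage sketch for the wrapper above, assuming SKLModel aliases sklearn's PassiveAggressiveClassifier and a scikit-learn 0.20.x install (the last series that accepts both early_stopping and the deprecated n_iter argument that this wrapper forwards):

# Hypothetical usage of PassiveAggressiveClassifierImpl.
from sklearn.linear_model import PassiveAggressiveClassifier as SKLModel
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clf = PassiveAggressiveClassifierImpl(max_iter=1000, tol=1e-3).fit(X, y)
print(clf.predict(X[:5]))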
Example #6
def model_translated_english():
    '''
    The model + pipeline for features extracted from the translated text (Spanish to English)
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier(max_iter=250, tol=1e-3)
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),
        # Use FeatureUnion to combine the features from subject and body
        (
            'union',
            FeatureUnion(
                transformer_list=[
                    # Pipeline bag-of-words model
                    (
                        'words',
                        Pipeline([
                            ('selector', ItemSelector(key='text_translated')),
                            ('tfidf',
                             TfidfVectorizer(preprocessor=identity,
                                             tokenizer=identity,
                                             max_df=.2)),
                            ('chi-square', SelectKBest(chi2, k=300)),  # alternative: k=3000
                        ])),

                    # Pipeline for high info words bag-of-words model
                    ('text_high',
                     Pipeline([
                         ('selector',
                          ItemSelector(key='text_translated_high')),
                         ('tfidf',
                          TfidfVectorizer(preprocessor=identity,
                                          tokenizer=identity,
                                          max_df=.2)),
                     ])),
                    ('char_n_grams',
                     Pipeline([('selector',
                                ItemSelector(key='text_ngram_translated')),
                               ('tfidf',
                                TfidfVectorizer(analyzer='char',
                                                ngram_range=(2, 6)))])),
                    ('word_n_grams',
                     Pipeline([('selector',
                                ItemSelector(key='text_ngram_translated')),
                               ('tfidf',
                                TfidfVectorizer(analyzer='word',
                                                ngram_range=(1, 3)))])),
                ], )),
        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])
    return classifier
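A usage sketch for the factory above; FeaturesExtractor, ItemSelector, and identity are project-specific helpers (not sklearn classes), and the input is assumed to be whatever raw documents FeaturesExtractor expects:

# Hypothetical usage of the pipeline factory above.
model = model_translated_english()
model.fit(train_docs, train_labels)   # train_docs/train_labels assumed to exist
predictions = model.predict(test_docs)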
Example #7
def demo(output_file=None, instances=40000):
    """ _test_prequential
    
    This demo shows how to produce a prequential evaluation.
    
    The first thing needed is a stream. For this case we use a file stream 
    which gets its samples from the sea_big.csv file, inside the datasets 
    folder.
    
    Then we need to setup a classifier, which in this case is an instance 
    of sklearn's PassiveAggressiveClassifier. Then, optionally we create a 
    pipeline structure, initialized on that classifier.
    
    The evaluation is then run.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
    
    """
    # Setup the File Stream
    #opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_big.csv", "CSV",
                     False)
    stream = FileStream(opt, -1, 1)
    #stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    #classifier = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000,leaf_size=40, categorical_list=None)
    #classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    classifier = PassiveAggressiveClassifier()
    #classifier = SGDRegressor()
    #classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator; avoid shadowing the built-in eval()
    evaluator = EvaluatePrequential(
        pretrain_size=200,
        max_instances=instances,
        batch_size=1,
        n_wait=100,
        max_time=1000,
        output_file=output_file,
        task_type='classification',
        show_plot=True,
        plot_options=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    evaluator.eval(stream=stream, classifier=pipe)
Example #9
def demo(output_file=None, instances=40000):
    """ _test_prequential
    
    This demo shows how to produce a prequential evaluation.
    
    The first thing needed is a stream. For this case we use a file stream 
    which gets its samples from the sea_big.csv file.
    
    Then we need to setup a classifier, which in this case is an instance 
    of sklearn's PassiveAggressiveClassifier. Then, optionally we create a 
    pipeline structure, initialized on that classifier.
    
    The evaluation is then run.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
    
    """
    # Setup the File Stream
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/sea_big.csv")
    # stream = WaveformGenerator()

    # Setup the classifier
    # classifier = SGDClassifier()
    # classifier = KNNADWINClassifier(n_neighbors=8, max_window_size=2000,leaf_size=40, nominal_attributes=None)
    # classifier = OzaBaggingADWINClassifier(base_estimator=KNNClassifier(n_neighbors=8, max_window_size=2000,
    #                                        leaf_size=30))
    classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=200,
        max_samples=instances,
        batch_size=1,
        n_wait=100,
        max_time=1000,
        output_file=output_file,
        show_plot=True,
        metrics=['kappa', 'kappa_t', 'accuracy'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
Example #10
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.estimator = PassiveAggressiveClassifier(
                C=self.C, fit_intercept=self.fit_intercept, n_iter=1,
                loss=self.loss, shuffle=True, random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)

        return self
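Examples #10/#14 grow n_iter on an old scikit-learn estimator; that parameter was removed in 0.21. A rough modern equivalent sketched with partial_fit (assumes scikit-learn >= 0.21 and preexisting X, y):

# Hedged sketch: one pass over the data per call, as iterative_fit intends.
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier(C=1.0, random_state=0)
classes = np.unique(y)
for _ in range(20):                  # roughly corresponds to n_iter=20
    clf.partial_fit(X, y, classes=classes)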
Example #11
    def __init__(self, C=1.0, fit_intercept=True, max_iter=None, tol=None,
                 early_stopping=False, validation_fraction=0.1,
                 n_iter_no_change=5, shuffle=True, verbose=0, loss='hinge',
                 n_jobs=None, random_state=None, warm_start=False,
                 class_weight='balanced', average=False, n_iter=None):
        self._hyperparams = {
            'C': C,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'shuffle': shuffle,
            'verbose': verbose,
            'loss': loss,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'warm_start': warm_start,
            'class_weight': class_weight,
            'average': average,
            'n_iter': n_iter}
        self._wrapped_model = SKLModel(**self._hyperparams)
Example #12
def model_title():
    '''
    The model + pipeline for features extracted from the title
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier()
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),
        # Use FeatureUnion to combine the features from subject and body
        (
            'union',
            FeatureUnion(
                transformer_list=[

                    # Pipeline for title words
                    ('title',
                     Pipeline([
                         ('selector', ItemSelector(key='title')),
                         ('tfidf',
                          TfidfVectorizer(preprocessor=identity,
                                          tokenizer=identity)),
                     ])),
                ],

                # weight components in FeatureUnion
                transformer_weights={
                    # 'title': .3,
                },
            )),
        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])
    return classifier
Example #13
    'LDA': (True, LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001)),
    'LogisticRegression': (True, LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)),
    'CalibratedClassifierCV': (True, CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv='warn')),
    'LinearSVC': (True, LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)),
    'LinearSVM': (True, SVC(kernel='linear', C=0.025)),        # alternative: C=0.01, penalty='l1', dual=False
    'RBF_SVM': (True, SVC(gamma='auto')),                      # alternative: gamma=2, C=1
    'Nu_SVM': (True, NuSVC(gamma='auto')),
    'GaussianProcess': (False, GaussianProcessClassifier()),   # alternative: kernel=1.0 * RBF(1.0)
    'NeuralNet': (True, MLPClassifier(alpha=1, max_iter=1000)),
    'QDA': (True, QuadraticDiscriminantAnalysis()),
    'NaiveBayes': (True, GaussianNB()),
    'RadiusNeighborsClassifier': (True, RadiusNeighborsClassifier()),
    'SGDClassifier': (True, SGDClassifier()),
    'RidgeClassifierCV': (True, RidgeClassifierCV()),
    'RidgeClassifier': (True, RidgeClassifier()),
    'PassiveAggressiveClassifier': (True, PassiveAggressiveClassifier()),
    'LabelPropagation': (True, LabelPropagation()),
    'LabelSpreading': (False, LabelSpreading()),
    'MultinomialNB': (True, MultinomialNB()),
    'NearestCentroid': (True, NearestCentroid()),
    'Perceptron': (True, Perceptron()),
}
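Example #13 is a fragment of a name -> (enabled, model) registry; a sketch of how such a table is usually consumed, assuming the surrounding dict is bound to a name like `classifiers` and that train/test splits exist:

# Hypothetical consumption of the (enabled, model) registry above.
for name, (enabled, model) in classifiers.items():
    if not enabled:
        continue
    model.fit(X_train, y_train)
    print(name, model.score(X_test, y_test))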


# feature_set is used for manually enabling the individual features.
# NOTE: setting the boolean value enables/disables the feature.
feature_set = {
    'backers_count': True,
    'converted_pledged_amount': True,
    'goal': True,
    'country': True,
Example #14
class PassiveAggressive(ParamSklearnClassificationAlgorithm):
    def __init__(self, C, fit_intercept, n_iter, loss, random_state=None):
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.n_iter = int(n_iter)
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.estimator = PassiveAggressiveClassifier(
                C=self.C, fit_intercept=self.fit_intercept, n_iter=1,
                loss=self.loss, shuffle=True, random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Stochastic Gradient Descent '
                        'Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter("loss",
                                         ["hinge", "squared_hinge"],
                                         default="hinge")
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20)
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1, log=True)
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(C)
        return cs
Example #15
class PassiveAggressive(
        IterativeComponentWithSampleWeight,
        BaseClassificationModel,
):
    def __init__(self,
                 C,
                 fit_intercept,
                 tol,
                 loss,
                 average,
                 random_state=None):
        self.C = C
        self.fit_intercept = fit_intercept
        self.average = average
        self.tol = tol
        self.loss = loss
        self.random_state = random_state
        self.estimator = None
        self.time_limit = None
        self.start_time = time.time()

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter, 1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(X,
                                            y,
                                            alpha=1.0,
                                            C=self.estimator.C,
                                            loss="hinge",
                                            learning_rate=lr,
                                            max_iter=n_iter,
                                            classes=None,
                                            sample_weight=sample_weight,
                                            coef_init=None,
                                            intercept_init=None)
                if (self.estimator.max_iter >= 1000
                        or n_iter > self.estimator.n_iter_):
                    self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'PassiveAggressive Classifier',
            'name': 'Passive Aggressive Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        if optimizer == 'smac':
            C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
            fit_intercept = UnParametrizedHyperparameter(
                "fit_intercept", "True")
            loss = CategoricalHyperparameter("loss",
                                             ["hinge", "squared_hinge"],
                                             default_value="hinge")

            tol = UniformFloatHyperparameter("tol",
                                             1e-5,
                                             1e-1,
                                             default_value=1e-4,
                                             log=True)
            # Note: Average could also be an Integer if > 1
            average = CategoricalHyperparameter('average', ['False', 'True'],
                                                default_value='False')

            cs = ConfigurationSpace()
            cs.add_hyperparameters([loss, fit_intercept, tol, C, average])
            return cs
        elif optimizer == 'tpe':
            space = {
                'C': hp.loguniform("pa_C", np.log(1e-5), np.log(10)),
                'fit_intercept': hp.choice('pa_fit_intercept', ["True"]),
                'loss': hp.choice('pr_loss', ["hinge", "squared_hinge"]),
                'tol': hp.loguniform('pr_tol', np.log(1e-5), np.log(1e-1)),
                'average': hp.choice('pr_average', ["False", "True"])
            }

            init_trial = {
                'C': 1,
                'fit_intercept': "True",
                'loss': "hinge",
                'tol': 1e-4,
                'average': "False"
            }

            return space
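A small sketch of exercising the 'smac' branch above, assuming the ConfigSpace package these hyperparameter classes come from (get_dictionary() is the pre-1.0 ConfigSpace accessor; newer releases also allow dict(config)):

# Hypothetical: sample one configuration from the SMAC search space.
cs = PassiveAggressive.get_hyperparameter_search_space()
config = cs.sample_configuration()
print(config.get_dictionary())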
Example #16
def model_words():
    '''
    The model + pipeline for features extracted from the text
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier(C=1,
                                    max_iter=1000,
                                    tol=1e-3,
                                    n_jobs=-1,
                                    class_weight="balanced"),
        PassiveAggressiveClassifier(C=0.1,
                                    max_iter=1500,
                                    tol=0.01,
                                    n_jobs=-1,
                                    class_weight="balanced",
                                    fit_intercept=False,
                                    loss="squared_hinge"),
        AdaBoostClassifier(n_estimators=200),
        MultinomialNB(),
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),
        # Use FeatureUnion to combine the features from subject and body
        (
            'union',
            FeatureUnion(
                transformer_list=[
                    ('text_high',
                     Pipeline([
                         ('selector', ItemSelector(key='text_high')),
                         ('tfidf',
                          TfidfVectorizer(preprocessor=identity,
                                          tokenizer=identity,
                                          max_df=.2)),
                     ])),
                    ('word_n_grams',
                     Pipeline([('selector', ItemSelector(key='sentence')),
                               ('tfidf',
                                TfidfVectorizer(analyzer='word',
                                                ngram_range=(1, 5)))])),
                    ('char_n_grams',
                     Pipeline([('selector', ItemSelector(key='sentence')),
                               ('tfidf',
                                TfidfVectorizer(analyzer='char',
                                                ngram_range=(2, 5)))])),
                    ('sentiment',
                     Pipeline([('selector', ItemSelector(key='sentiment')),
                               ('tfidf', TfidfVectorizer(analyzer='char'))])),
                    ('opinion_towards',
                     Pipeline([
                         ('selector', ItemSelector(key='opinion')),
                     ])),
                    ('target',
                     Pipeline([
                         ('selector', ItemSelector(key='target')),
                     ])),

                    #### FEATURES THAT DO NOT WORK ####

                    # ('sentiment_cont', Pipeline([
                    #     ('selector', ItemSelector(key='sentence')),
                    #     ('feature', SentimentContinuous())
                    # ])),

                    # ('glove', Pipeline([
                    #     ('selector', ItemSelector(key='sentence')),
                    #     ('tfidf', TfidfEmbeddingVectorizer(glove))
                    # ])),

                    # ('sentence_length', Pipeline([
                    #     ('selector', ItemSelector(key='sentence_length')),
                    #     ('scaler', MinMaxScaler())
                    # ])),
                ],

                # weight components in FeatureUnion
                transformer_weights={
                    'text_high': 1,
                    'word_n_grams': .8,
                    'char_n_grams': .8,
                    'sentiment': .8,
                    'opinion_towards': 1,
                    'target': 1,
                },
            )),
        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])
    return classifier
Example #17
def model_words():
    '''
    The model + pipeline for features extracted from the text
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier(max_iter=250, tol=1e-3),
        PassiveAggressiveClassifier(C=0.001,
                                    class_weight="balanced",
                                    fit_intercept=False,
                                    loss="squared_hinge",
                                    max_iter=7500)
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),
        # Use FeatureUnion to combine the features from subject and body
        (
            'union',
            FeatureUnion(
                transformer_list=[
                    # Pipeline bag-of-words model
                    (
                        'words',
                        Pipeline([
                            ('selector', ItemSelector(key='text')),
                            ('tfidf',
                             TfidfVectorizer(preprocessor=identity,
                                             tokenizer=identity,
                                             max_df=.2)),
                            ('chi-square', SelectKBest(chi2, k=300)),  # alternative: k=3000
                        ])),

                    # Pipeline for high info words bag-of-words model
                    ('text_high',
                     Pipeline([
                         ('selector', ItemSelector(key='text_high')),
                         ('tfidf',
                          TfidfVectorizer(preprocessor=identity,
                                          tokenizer=identity,
                                          max_df=.2)),
                     ])),

                    # ('char_n_grams', Pipeline([
                    #     ('selector', ItemSelector(key='text_ngram')),
                    #     ('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(2,6)))
                    # ])),
                    ('word_n_grams',
                     Pipeline([('selector', ItemSelector(key='text_ngram')),
                               ('tfidf',
                                TfidfVectorizer(analyzer='word',
                                                ngram_range=(1, 3)))])),

                    # ('sentiment_cont', Pipeline([
                    #     ('selector', ItemSelector(key='text_ngram')),
                    #     ('feature', SentimentContinuous())
                    # ])),
                ], )),
        # Use a classifier on the combined features
        ('clf', clfs[3]),
    ])
    return classifier
Example #18
    def iterative_fit(self, X, y, n_iter=2, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter, 1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(X,
                                            y,
                                            alpha=1.0,
                                            C=self.estimator.C,
                                            loss="hinge",
                                            learning_rate=lr,
                                            max_iter=n_iter,
                                            classes=None,
                                            sample_weight=None,
                                            coef_init=None,
                                            intercept_init=None)
                if (self.estimator._max_iter >= 1000
                        or n_iter > self.estimator.n_iter_):
                    self.fully_fit_ = True

        return self
Example #19
class PassiveAggressive(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    def __init__(self, C, fit_intercept, tol, loss, average, random_state=None):
        self.C = C
        self.fit_intercept = fit_intercept
        self.average = average
        self.tol = tol
        self.loss = loss
        self.random_state = random_state
        self.estimator = None


    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter,
                                              1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(
                    X, y,
                    alpha=1.0,
                    C=self.estimator.C,
                    loss="hinge",
                    learning_rate=lr,
                    max_iter=n_iter,
                    classes=None,
                    sample_weight=sample_weight,
                    coef_init=None,
                    intercept_init=None
                )
                if (
                    self.estimator._max_iter >= 1000
                    or n_iter > self.estimator.n_iter_
                ):
                    self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        loss = CategoricalHyperparameter(
            "loss", ["hinge", "squared_hinge"], default_value="hinge"
        )

        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, default_value=1e-4,
                                         log=True)
        # Note: Average could also be an Integer if > 1
        average = CategoricalHyperparameter('average', ['False', 'True'],
                                            default_value='False')

        cs = ConfigurationSpace()
        cs.add_hyperparameters([loss, fit_intercept, tol, C, average])
        return cs
Example #20
def __get_classifier_model(classifier, args):
    """
    Convenience function for obtaining a classification model

    Args:
        classifier(str): A string indicating the name of the classifier
        args: An arguments object

    Returns:
        A classification model based on the given classifier string
    """
    # Make SGD Logistic Regression model the default
    model = SGDClassifier(loss='log',
                          penalty='l2',
                          shuffle=True,
                          n_iter=5,
                          n_jobs=-1,
                          random_state=179)
    if classifier == SVM:
        model = SVC(kernel=args.kernel,
                    class_weight="balanced",
                    cache_size=8096,
                    random_state=17,
                    probability=True)
    elif classifier == ADA_BOOST:
        dt = DecisionTreeClassifier(max_depth=15,
                                    criterion='gini',
                                    max_features='auto',
                                    class_weight='balanced',
                                    random_state=39)
        model = AdaBoostClassifier(base_estimator=dt,
                                   n_estimators=400,
                                   random_state=17)
    elif classifier == RF:
        # Configure the classifier to use all available CPU cores
        model = RandomForestClassifier(class_weight="balanced",
                                       n_jobs=-1,
                                       n_estimators=400,
                                       random_state=17,
                                       max_features='auto',
                                       max_depth=15,
                                       criterion='gini')
    elif classifier == GRADIENT_BOOST:
        model = GradientBoostingClassifier(random_state=17,
                                           n_estimators=400,
                                           max_features='auto')
    elif classifier == EXTRA_TREES:
        model = ExtraTreesClassifier(random_state=17,
                                     n_estimators=400,
                                     n_jobs=-1,
                                     class_weight='balanced',
                                     max_depth=15,
                                     max_features='auto',
                                     criterion='gini')
    elif classifier == BAGGING:
        dt = DecisionTreeClassifier(max_depth=15,
                                    criterion='gini',
                                    max_features='auto',
                                    class_weight='balanced',
                                    random_state=39)
        model = BaggingClassifier(base_estimator=dt,
                                  n_estimators=400,
                                  random_state=17,
                                  n_jobs=-1,
                                  max_features=0.8,
                                  max_samples=0.8,
                                  bootstrap=False)
    elif classifier == PASSIVE_AGGRESSIVE:
        model = PassiveAggressiveClassifier(n_iter=10,
                                            class_weight='balanced',
                                            n_jobs=-1,
                                            random_state=41)
    elif classifier == PERCEPTRON:
        model = Perceptron(n_jobs=-1,
                           n_iter=10,
                           penalty='l2',
                           class_weight='balanced',
                           alpha=0.25)
    return model
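Several constructors above use pre-0.21 scikit-learn arguments: n_iter on SGDClassifier, PassiveAggressiveClassifier, and Perceptron (renamed max_iter), and loss='log' (renamed 'log_loss' in 1.1). A hedged rewrite of the default model for a modern install:

# Hypothetical modern equivalent of the default SGD logistic model above,
# assuming scikit-learn >= 1.1.
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(loss='log_loss',   # was loss='log'
                      penalty='l2',
                      shuffle=True,
                      max_iter=5,        # was n_iter=5
                      n_jobs=-1,
                      random_state=179)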
Example #21
def model_words():
    '''
    The model + pipeline for features extracted from the text
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier()
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),
        # Use FeatureUnion to combine the features from subject and body
        (
            'union',
            FeatureUnion(
                #n_jobs = -1,
                transformer_list=[
                    # Pipeline bag-of-words model
                    ('words',
                     Pipeline([
                         ('selector', ItemSelector(key='text')),
                         ('tfidf',
                          TfidfVectorizer(preprocessor=identity,
                                          tokenizer=identity,
                                          max_df=.2)),
                         ('chi-square', SelectKBest(chi2, 3000)),
                     ])),

                    # # Pipeline for character features
                    # ('chars', Pipeline([
                    #     ('selector', ItemSelector(key='char')),
                    #     ('tfidf', TfidfVectorizer(analyzer='char', preprocessor = identity, tokenizer = identity, ngram_range=(3,10))),
                    # ])),

                    # Pipeline for high info words bag-of-words model
                    ('text_high',
                     Pipeline([
                         ('selector', ItemSelector(key='text_high')),
                         ('tfidf',
                          TfidfVectorizer(preprocessor=identity,
                                          tokenizer=identity,
                                          max_df=.2)),
                     ])),

                    # Pipeline for POS tags
                    # ('pos_tag', Pipeline([
                    #     ('selector', ItemSelector(key='pos_tag')),
                    #     ('tfidf', TfidfVectorizer(preprocessor = identity, tokenizer = identity)),
                    # ])),

                    # Pipeline for named entity tags
                    # ('named_ent', Pipeline([
                    #     ('selector', ItemSelector(key='named_ent')),
                    #     ('tfidf', TfidfVectorizer(preprocessor = identity, tokenizer = identity)),
                    # ])),
                ],

                # weight components in FeatureUnion
                transformer_weights={
                    # 'text': .3,
                    # 'chars': .4,
                    # 'text_high' : .7,
                    # 'pos_tag': .1,
                },
            )),
        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])
    return classifier
Example #22
class PassiveAggressive(AutoSklearnClassificationAlgorithm):
    def __init__(self, C, fit_intercept, n_iter, loss, random_state=None):
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.n_iter = int(n_iter)
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                n_iter=1,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'PassiveAggressive Classifier',
            'name': 'Passive Aggressive Stochastic Gradient Descent '
            'Classifier',
            'handles_missing_values': False,
            'handles_nominal_values': False,
            'handles_numerical_features': True,
            'prefers_data_scaled': True,
            'prefers_data_normalized': True,
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'handles_sparse': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, ),
            # TODO find out what is best used here!
            'preferred_dtype': None
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter("loss", ["hinge", "squared_hinge"],
                                         default="hinge")
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter",
                                              5,
                                              1000,
                                              default=20,
                                              log=True)
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1, log=True)
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(C)
        return cs
Example #23
def cargaClassifiers(params, n_classes):

    gamma = params[0][0]
    n_gaussianRF = params[0][1]
    window_size = params[1][0]
    vecinos = params[1][1]
    hoja_size = params[1][2]

    # KNN and GRF_KNN
    clf_1 = KNN(n_neighbors=vecinos, leaf_size=hoja_size, max_window_size=window_size)

    clf_2 = GRF_KNN(n_neighbors=vecinos, leaf_size=hoja_size, max_window_size=window_size)
    clf_2.gamma = gamma
    clf_2.n_gaussianRF = n_gaussianRF

    # HoeffdingTree and GRF_HoeffdingTree
    clf_3 = HoeffdingTree()

    clf_4 = GRF_HoeffdingTree()
    clf_4.gamma = gamma
    clf_4.n_gaussianRF = n_gaussianRF

    # HoeffdingAdaptiveTree and GRF_HoeffdingAdaptiveTree
    clf_5 = HAT()

    clf_6 = GRF_HoeffdingAdaptiveTree()
    clf_6.gamma = gamma
    clf_6.n_gaussianRF = n_gaussianRF

    # NaiveBayes and GRF_NaiveBayes
    # clf_7 = NaiveBayes()
    #
    # clf_8 = GRF_NaiveBayes()
    # clf_8.gamma = gamma
    # clf_8.n_gaussianRF = n_gaussianRF

    # GNB and GRF_GNB
    clf_9 = GaussianNB()

    clf_10 = GRF_GaussianNB()
    clf_10.gamma = gamma
    clf_10.n_gaussianRF = n_gaussianRF

    # SGDClassifier and GRF_SGDClassifier
    clf_11 = SGDClassifier(max_iter=1)

    clf_12 = GRF_SGDClassifier(max_iter=1)
    clf_12.gamma = gamma
    clf_12.n_gaussianRF = n_gaussianRF

    # Perceptron and GRF_Perceptron
    clf_13 = SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant', penalty=None, max_iter=1)

    clf_14 = GRF_SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant', penalty=None, max_iter=1)
    clf_14.gamma = gamma
    clf_14.n_gaussianRF = n_gaussianRF

    # PassiveAggressiveClassifier and GRF_PassiveAggressiveClassifier
    clf_15 = PassiveAggressiveClassifier(max_iter=1)

    clf_16 = GRF_PassiveAggressiveClassifier(max_iter=1)
    clf_16.gamma = gamma
    clf_16.n_gaussianRF = n_gaussianRF

    # MLPClassifier and GRF_MLPClassifier
    clf_17 = MLPClassifier(batch_size=1, max_iter=1, hidden_layer_sizes=(100,))

    clf_18 = GRF_MLPClassifier(batch_size=1, max_iter=1, hidden_layer_sizes=(100,))
    clf_18.gamma = gamma
    clf_18.n_gaussianRF = n_gaussianRF

    classifiers = [clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_9, clf_10,
                   clf_11, clf_12, clf_13, clf_14, clf_15, clf_16, clf_17, clf_18]
    classifiers_init = [clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_9, clf_10,
                        clf_11, clf_12, clf_13, clf_14, clf_15, clf_16, clf_17, clf_18]

#    classifiers = [clf_1,clf_2]
#    classifiers_init = [clf_1,clf_2]
    
    names=[]
    for c in range(len(classifiers)):
        classifier=classifiers[c]
        class_name=''
        
        if str(classifier)[26:33]=='GRF_KNN':    
            class_name=str(classifier)[26:33]
        elif str(classifier)[22:25]=='KNN':    
            class_name=str(classifier)[22:25]
        elif str(classifier)[34:47]=='HoeffdingTree':
            class_name='HT'
        elif str(classifier)[38:55]=='GRF_HoeffdingTree':
            class_name='GRF_HT'
        elif str(classifier)[43:46]=='HAT':
            class_name=str(classifier)[43:46]
        elif str(classifier)[47:72]=='GRF_HoeffdingAdaptiveTree':
            class_name='GRF_HAT'
#        elif str(classifier)[31:41]=='NaiveBayes':
#            class_name='MNB'            
#        elif str(classifier)[35:49]=='GRF_NaiveBayes':
#            class_name='GRF_MNB'
        elif str(classifier)[0:10]=='GaussianNB':
            class_name='GNB'
        elif str(classifier)[0:14]=='GRF_GaussianNB':
            class_name='GRF_GNB'
        elif str(classifier)[0:13]=='SGDClassifier' and classifier.loss=='hinge':
            class_name='SGD'
        elif str(classifier)[0:17]=='GRF_SGDClassifier' and classifier.loss=='hinge':
            class_name='GRF_SGD'
        elif str(classifier)[0:13]=='SGDClassifier' and classifier.loss=='perceptron':
            class_name='Perceptron'
        elif str(classifier)[0:17]=='GRF_SGDClassifier' and classifier.loss=='perceptron':
            class_name='GRF_Perceptron'
        elif str(classifier)[0:27]=='PassiveAggressiveClassifier':
            class_name='PA'
        elif str(classifier)[0:31]=='GRF_PassiveAggressiveClassifier':
            class_name='GRF_PA'
        elif str(classifier)[0:13]=='MLPClassifier':
            class_name='MLP'
        elif str(classifier)[0:17]=='GRF_MLPClassifier':
            class_name='GRF_MLP'
#        elif str(classifier)[0:9]=='OnlineGRF':
#            class_name=str(classifier)[0:9]
    
        names.append(class_name)
    
    return classifiers,names,classifiers_init
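The str(classifier)[26:33]-style slicing above breaks whenever repr formatting changes. A sturdier sketch using the class name, under the assumption that the same classifier objects are in play:

# Hypothetical replacement for the slicing-based name detection above.
def short_name(clf):
    name = type(clf).__name__
    if name.endswith('SGDClassifier'):                 # SGD or GRF_SGD variants
        prefix = 'GRF_' if name.startswith('GRF_') else ''
        return prefix + ('Perceptron' if clf.loss == 'perceptron' else 'SGD')
    table = {'HoeffdingTree': 'HT', 'GRF_HoeffdingTree': 'GRF_HT',
             'HAT': 'HAT', 'GRF_HoeffdingAdaptiveTree': 'GRF_HAT',
             'GaussianNB': 'GNB', 'GRF_GaussianNB': 'GRF_GNB',
             'PassiveAggressiveClassifier': 'PA',
             'GRF_PassiveAggressiveClassifier': 'GRF_PA',
             'MLPClassifier': 'MLP', 'GRF_MLPClassifier': 'GRF_MLP'}
    return table.get(name, name)

names = [short_name(c) for c in classifiers]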
Example #24
class DocSearch(object):
    """
    Index a set of documents. Can provide:
        * documents that match a list of keywords
        * suggestions for user input.
        * instances of documents
    """

    INDEX_STEP_LOADING = "loading"
    INDEX_STEP_CLEANING = "cleaning"
    INDEX_STEP_CHECKING = "checking"
    INDEX_STEP_READING = "reading"
    INDEX_STEP_COMMIT = "commit"
    LABEL_STEP_UPDATING = "label updating"
    LABEL_STEP_DESTROYING = "label deletion"
    WHOOSH_SCHEMA = whoosh.fields.Schema(
        # static up to date schema
        docid=whoosh.fields.ID(stored=True, unique=True),
        doctype=whoosh.fields.ID(stored=True, unique=False),
        docfilehash=whoosh.fields.ID(stored=True),
        content=whoosh.fields.TEXT(spelling=True),
        label=whoosh.fields.KEYWORD(stored=True, commas=True,
                                    spelling=True, scorable=True),
        date=whoosh.fields.DATETIME(stored=True),
        last_read=whoosh.fields.DATETIME(stored=True),
    )
    LABEL_ESTIMATOR_TEMPLATE = PassiveAggressiveClassifier(n_iter=50)

    """
    Label_estimators is a dict with one estimator per label.
    Each label is predicted with its own estimator (OneVsAll strategy)
    We cannot use directly OneVsAllClassifier sklearn class because
    it doesn't support online learning (partial_fit)
    """
    label_estimators = {}

    def __init__(self, rootdir, callback=dummy_progress_cb):
        """
        Index files in rootdir (see constructor)

        Arguments:
            callback --- called during the indexation (may be called *often*).
                step : DocSearch.INDEX_STEP_READING or
                    DocSearch.INDEX_STEP_SORTING
                progression : how many elements done yet
                total : number of elements to do
                document (only if step == DocSearch.INDEX_STEP_READING): file
                    being read
        """
        self.rootdir = rootdir
        base_indexdir = os.getenv("XDG_DATA_HOME",
                                  os.path.expanduser("~/.local/share"))
        self.indexdir = os.path.join(base_indexdir, "paperwork", "index")
        mkdir_p(self.indexdir)

        self.__docs_by_id = {}  # docid --> doc
        self.label_list = []

        need_index_rewrite = True
        try:
            logger.info("Opening index dir '%s' ..." % self.indexdir)
            self.index = whoosh.index.open_dir(self.indexdir)
            # check that the schema is up-to-date
            # We use the string representation of the schemas, because previous
            # versions of whoosh don't always implement __eq__
            if str(self.index.schema) == str(self.WHOOSH_SCHEMA):
                need_index_rewrite = False
        except whoosh.index.EmptyIndexError as exc:
            logger.warning("Failed to open index '%s'" % self.indexdir)
            logger.warning("Exception was: %s" % str(exc))

        if need_index_rewrite:
            logger.info("Creating a new index")
            self.index = whoosh.index.create_in(self.indexdir,
                                                self.WHOOSH_SCHEMA)
            logger.info("Index '%s' created" % self.indexdir)

        self.__searcher = self.index.searcher()

        class CustomFuzzy(whoosh.qparser.query.FuzzyTerm):
            def __init__(self, fieldname, text, boost=1.0, maxdist=1,
                         prefixlength=0, constantscore=True):
                whoosh.qparser.query.FuzzyTerm.__init__(
                    self, fieldname, text, boost, maxdist,
                    prefixlength, constantscore=True
                )

        facets = [whoosh.sorting.ScoreFacet(),
                  whoosh.sorting.FieldFacet("date", reverse=True)]

        self.search_param_list = {
            'full': [
                {
                    "query_parser": whoosh.qparser.MultifieldParser(
                        ["label", "content"], schema=self.index.schema,
                        termclass=CustomFuzzy),
                    "sortedby": facets
                },
                {
                    "query_parser": whoosh.qparser.MultifieldParser(
                        ["label", "content"], schema=self.index.schema,
                        termclass=whoosh.qparser.query.Prefix),
                    "sortedby": facets
                },
            ],
            'fast': [
                {
                    "query_parser": whoosh.qparser.MultifieldParser(
                        ["label", "content"], schema=self.index.schema,
                        termclass=whoosh.query.Term),
                    "sortedby": facets
                },
            ],
        }

        self.check_workdir()
        self.cleanup_rootdir(callback)
        self.reload_index(callback)

        self.label_estimators_dir = os.path.join(
            base_indexdir, "paperwork", "label_estimators")
        self.label_estimators_file = os.path.join(
            self.label_estimators_dir, "label_estimators.jbl")
        try:
            logger.info("Opening label_estimators file '%s' ..." %
                        self.label_estimators_file)
            (l_estimators, ver) = joblib.load(self.label_estimators_file)
            if ver != BasicDoc.FEATURES_VER:
                logger.info("Estimator version is not up to date")
                self.label_estimators = {}
            else:
                self.label_estimators = l_estimators

            # check that the label_estimators are up to date for their class
            for label_name in self.label_estimators:
                params = self.label_estimators[label_name].get_params()
                if params != self.LABEL_ESTIMATOR_TEMPLATE.get_params():
                    raise IndexError('label_estimators params are not up to'
                                     + ' date')
        except Exception as exc:
            logger.error(("Failed to open label_estimator file '%s', or bad"
                          + " label_estimator structure")
                         % self.label_estimators_file)
            logger.error("Exception was: %s" % exc)
            logger.info("Will create new label_estimators")
            self.label_estimators = {}
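
A minimal sketch of the per-label strategy described in the class docstring above: one clone of LABEL_ESTIMATOR_TEMPLATE per label, each trained online with partial_fit. The names `features` (one document's feature vector), `labels` (the labels attached to that document) and `all_labels` are hypothetical, not from the original code:

from sklearn.base import clone

def update_label_estimators(label_estimators, template, features, labels,
                            all_labels):
    # one binary estimator per label, updated incrementally (one-vs-all)
    for label in all_labels:
        estimator = label_estimators.setdefault(label, clone(template))
        target = 1 if label in labels else 0
        estimator.partial_fit([features], [target], classes=[0, 1])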
Example #25
if ",GNB," in Functions:
    models.append(('GNB', GaussianNB()))
if ",QDA," in Functions:
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
if ",GBC," in Functions:
    models.append(('GBC', GradientBoostingClassifier()))
if ",ETC," in Functions:
    models.append(('ETC', ExtraTreeClassifier()))
if ",BC," in Functions:
    models.append(('BC', BaggingClassifier()))
if ",SGDC," in Functions:
    models.append(('SGDC', SGDClassifier()))
if ",RC," in Functions:
    models.append(('RC', RidgeClassifier()))
if ",PAC," in Functions:
    models.append(('PAC', PassiveAggressiveClassifier()))
if ",ETSC," in Functions:
    models.append(('ETSC', ExtraTreesClassifier()))
if ",BNB," in Functions:
    models.append(('BNB', BernoulliNB()))
if ",GM," in Functions:
    models.append(('GM', GaussianMixture()))

from sklearn.model_selection import KFold
from collections import Counter

Predictii = [[] for _ in range(len(Y_Test))]

Accs = []

normlist = []
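
The fragment stops right before its training loop. A sketch of how such a setup is usually completed, assuming `X_Train`, `Y_Train`, `X_Test` and `Y_Test` already exist (only `models`, `Predictii`, `Accs` and `Y_Test` appear in the original):

from sklearn.metrics import accuracy_score

for name, model in models:
    model.fit(X_Train, Y_Train)
    pred = model.predict(X_Test)
    Accs.append(accuracy_score(Y_Test, pred))
    for i, p in enumerate(pred):
        Predictii[i].append(p)   # collect one vote per model per sample

# majority vote across all selected models
voted = [Counter(votes).most_common(1)[0][0] for votes in Predictii]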
Example #26
class PassiveAggressive(AutoSklearnClassificationAlgorithm):
    def __init__(self,
                 C,
                 fit_intercept,
                 tol,
                 loss,
                 average,
                 random_state=None):
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.tol = float(tol)
        self.loss = loss
        self.average = average == 'True'
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        n_iter = 2
        self.iterative_fit(X, y, n_iter=n_iter, refit=True)
        while not self.configuration_fully_fitted():
            n_iter *= 2
            self.iterative_fit(X, y, n_iter=n_iter)

        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # We need to fit for at least two iterations, otherwise early stopping
        # cannot work: convergence is only observable when the SGD solver
        # spends fewer iterations than max_iter. With max_iter == 1 it always
        # spends exactly one iteration, so we cannot know about convergence.
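        # Concretely (an illustrative note, not in the original source): fit()
        # above requests n_iter = 2, 4, 8, ...; each call below raises
        # max_iter by that amount (capped at 1000) and stops once the cap is
        # reached or the estimator converged in fewer epochs than requested
        # (n_iter > n_iter_).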

        if refit:
            self.estimator = None

        if self.estimator is None:
            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter, 1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(X,
                                            y,
                                            alpha=1.0,
                                            C=self.estimator.C,
                                            loss="hinge",
                                            learning_rate=lr,
                                            max_iter=n_iter,
                                            classes=None,
                                            sample_weight=None,
                                            coef_init=None,
                                            intercept_init=None)
                if (self.estimator._max_iter >= 1000
                        or n_iter > self.estimator.n_iter_):
                    self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

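        # PA exposes no native predict_proba; approximate class probabilities
        # by applying a softmax to the decision_function margins.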
        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'PassiveAggressive Classifier',
            'name': 'Passive Aggressive Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        loss = CategoricalHyperparameter("loss", ["hinge", "squared_hinge"],
                                         default_value="hinge")

        tol = UniformFloatHyperparameter("tol",
                                         1e-5,
                                         1e-1,
                                         default_value=1e-4,
                                         log=True)
        average = CategoricalHyperparameter('average', [False, True])

        cs = ConfigurationSpace()
        cs.add_hyperparameters([loss, fit_intercept, tol, C, average])
        return cs
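
A hypothetical driver for the class above (the random data is only for illustration; the string-valued booleans mirror how auto-sklearn passes hyperparameters):

import numpy as np

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

clf = PassiveAggressive(C=1.0, fit_intercept='True', tol=1e-4,
                        loss='hinge', average='False', random_state=1)
clf.fit(X, y)                      # doubles n_iter internally until converged
probabilities = clf.predict_proba(X)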
Example #27
class PassiveAggressive(AutoSklearnClassificationAlgorithm):
    def __init__(self, C, fit_intercept, n_iter, loss, random_state=None):
        super(PassiveAggressive, self).__init__()
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.n_iter = int(n_iter)
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self._iterations = 0

            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                n_iter=1,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.n_iter = self.n_iter
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            # In the first iteration, there is not yet an intercept

            self.estimator.n_iter = n_iter
            self.estimator.partial_fit(X, y, classes=np.unique(y))
            if self._iterations >= self.n_iter:
                self.fully_fit_ = True
            self._iterations += n_iter

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'PassiveAggressive Classifier',
            'name': 'Passive Aggressive Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter("loss", ["hinge", "squared_hinge"],
                                         default="hinge")
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter",
                                              5,
                                              1000,
                                              default=20,
                                              log=True)
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1, log=True)
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(C)
        return cs
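
Note the API drift between this example and #26/#28: this class targets older scikit-learn, where PassiveAggressiveClassifier took an n_iter parameter; newer releases replaced it with max_iter plus a tol stopping criterion, e.g.:

PassiveAggressiveClassifier(n_iter=20)               # older scikit-learn
PassiveAggressiveClassifier(max_iter=20, tol=1e-4)   # newer scikit-learn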
Example #28
class PassiveAggressive:
    def __init__(self,
                 C,
                 fit_intercept,
                 tol,
                 loss,
                 average,
                 random_state=None):
        self.C = C
        self.fit_intercept = fit_intercept
        self.average = average
        self.tol = tol
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X,
                           y,
                           n_iter=2,
                           refit=True,
                           sample_weight=sample_weight)
        iteration = 2
        while not self.configuration_fully_fitted():
            n_iter = int(2**iteration / 2)
            self.iterative_fit(X,
                               y,
                               n_iter=n_iter,
                               sample_weight=sample_weight)
            iteration += 1
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # We need to fit for at least two iterations, otherwise early stopping
        # cannot work: convergence is only observable when the SGD solver
        # spends fewer iterations than max_iter. With max_iter == 1 it always
        # spends exactly one iteration, so we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter, 1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(X,
                                            y,
                                            alpha=1.0,
                                            C=self.estimator.C,
                                            loss="hinge",
                                            learning_rate=lr,
                                            max_iter=n_iter,
                                            classes=None,
                                            sample_weight=sample_weight,
                                            coef_init=None,
                                            intercept_init=None)
                if (self.estimator._max_iter >= 1000
                        or n_iter > self.estimator.n_iter_):
                    self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        df = self.estimator.decision_function(X)
        return softmax(df)
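
A hypothetical driver showing the sample-weight path (the arrays are only for illustration; string hyperparameters are legal here because the class coerces them via check_for_bool and float):

import numpy as np

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)
w = np.ones(100)                   # uniform per-sample weights

clf = PassiveAggressive(C='1.0', fit_intercept='True', tol='1e-4',
                        loss='hinge', average='False', random_state=1)
clf.fit(X, y, sample_weight=w)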
Example #29
			'NMF':NMF(),
			'NearestCentroid':NearestCentroid(),
			'NearestNeighbors':NearestNeighbors(),
			'Normalizer':Normalizer(),
			'NuSVC':NuSVC(),
			'NuSVR':NuSVR(),
			'Nystroem':Nystroem(),
			'OAS':OAS(),
			'OneClassSVM':OneClassSVM(),
			'OrthogonalMatchingPursuit':OrthogonalMatchingPursuit(),
			'OrthogonalMatchingPursuitCV':OrthogonalMatchingPursuitCV(),
			'PCA':PCA(),
			'PLSCanonical':PLSCanonical(),
			'PLSRegression':PLSRegression(),
			'PLSSVD':PLSSVD(),
			'PassiveAggressiveClassifier':PassiveAggressiveClassifier(),
			'PassiveAggressiveRegressor':PassiveAggressiveRegressor(),
			'Perceptron':Perceptron(),
			'ProjectedGradientNMF':ProjectedGradientNMF(),
			'QuadraticDiscriminantAnalysis':QuadraticDiscriminantAnalysis(),
			'RANSACRegressor':RANSACRegressor(),
			'RBFSampler':RBFSampler(),
			'RadiusNeighborsClassifier':RadiusNeighborsClassifier(),
			'RadiusNeighborsRegressor':RadiusNeighborsRegressor(),
			'RandomForestClassifier':RandomForestClassifier(),
			'RandomForestRegressor':RandomForestRegressor(),
			'RandomizedLasso':RandomizedLasso(),
			'RandomizedLogisticRegression':RandomizedLogisticRegression(),
			'RandomizedPCA':RandomizedPCA(),
			'Ridge':Ridge(),
			'RidgeCV':RidgeCV(),
Example #30
    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        # We need to fit for at least two iterations, otherwise early stopping
        # cannot work: convergence is only observable when the SGD solver
        # spends fewer iterations than max_iter. With max_iter == 1 it always
        # spends exactly one iteration, so we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.max_iter = 50
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
            else:
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter,
                                              1000)
                self.estimator._validate_params()
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(
                    X, y,
                    alpha=1.0,
                    C=self.estimator.C,
                    loss="hinge",
                    learning_rate=lr,
                    max_iter=n_iter,
                    classes=None,
                    sample_weight=sample_weight,
                    coef_init=None,
                    intercept_init=None
                )
                if (
                    self.estimator._max_iter >= 1000
                    or n_iter > self.estimator.n_iter_
                ):
                    self.fully_fit_ = True

        return self

class PassiveAggressive(AutoSklearnClassificationAlgorithm):
    def __init__(self, C, fit_intercept, n_iter, loss, random_state=None):
        super(PassiveAggressive, self).__init__()
        self.C = float(C)
        self.fit_intercept = fit_intercept == 'True'
        self.n_iter = int(n_iter)
        self.loss = loss
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.linear_model.passive_aggressive import \
            PassiveAggressiveClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self._iterations = 0

            self.estimator = PassiveAggressiveClassifier(
                C=self.C, fit_intercept=self.fit_intercept, n_iter=1,
                loss=self.loss, shuffle=True, random_state=self.random_state,
                warm_start=True)
            self.classes_ = np.unique(y.astype(int))

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            self.estimator.n_iter = self.n_iter
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            # In the first iteration, there is not yet an intercept

            self.estimator.n_iter = n_iter
            self.estimator.partial_fit(X, y, classes=np.unique(y))
            if self._iterations >= self.n_iter:
                self.fully_fit_ = True
            self._iterations += n_iter

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter("loss",
                                         ["hinge", "squared_hinge"],
                                         default="hinge")
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20,
                                              log=True)
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1, log=True)
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(C)
        return cs
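
Finally, a sketch of how such a configuration space is typically consumed. This assumes the ConfigSpace package, whose ConfigurationSpace.sample_configuration() and Configuration.get_dictionary() calls draw one random hyperparameter setting and expose it as a plain dict:

cs = PassiveAggressive.get_hyperparameter_search_space()
config = cs.sample_configuration()             # one random hyperparameter set
clf = PassiveAggressive(random_state=1, **config.get_dictionary())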