Example #1
 def test_fit_biginc(self):
     model = hyperopt_estimator(
         classifier=components.any_classifier('classifier'),
         verbose=1, max_evals=5, trial_timeout=5.0, fit_increment=20)
     model.fit(self.X, self.Y)
     # -- make sure we only get 5 even with big fit_increment
     assert len(model.trials.trials) == 5
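These unittest-style snippets (here and in several examples below) call self.X and self.Y without showing the fixture that defines them. A minimal sketch of a setUp that would make them runnable on their own; the attribute names follow the tests, but the shapes and the sign-based labels are assumptions:

import unittest
import numpy as np

class EstimatorFixture(unittest.TestCase):
    def setUp(self):
        # hypothetical fixture: 1000 two-dimensional points labeled by the
        # sign of the first feature
        np.random.seed(123)
        self.X = np.random.randn(1000, 2)
        self.Y = (self.X[:, 0] > 0).astype('int')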
Example #2
    def test_sparse_random_projection(self):
        # restrict n_components to be less than or equal to data dimension
        # to prevent sklearn warnings from printing during tests
        n_components = scope.int(
            hp.quniform('preprocessing.n_components', low=1, high=8, q=1))
        model = hyperopt_estimator(
            classifier=components.gaussian_nb('classifier'),
            preprocessing=[
                components.sparse_random_projection(
                    'preprocessing',
                    n_components=n_components,
                )
            ],
            algo=rand.suggest,
            trial_timeout=5.0,
            max_evals=5,
        )

        X_train = np.random.randn(1000, 8)
        Y_train = (X_train[:, 0] > 0).astype('int')
        X_test = np.random.randn(1000, 8)
        Y_test = (X_test[:, 0] > 0).astype('int')

        model.fit(X_train, Y_train)
        model.score(X_test, Y_test)
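The n_components passed to sparse_random_projection above is a hyperopt search-space expression rather than a fixed integer. A minimal sketch of sampling that expression directly with hyperopt's pyll utilities, just to see the candidate values it can produce (assumes hyperopt is installed; not part of the original test):

from hyperopt import hp
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample

# the same expression used in the test: integers 1..8, step 1
n_components = scope.int(
    hp.quniform('preprocessing.n_components', low=1, high=8, q=1))

# draw a few samples to inspect candidate values
for _ in range(3):
    print(sample(n_components))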
Example #3
 def test_fit_iter_basic(self):
     model = hyperopt_estimator(verbose=1, trial_timeout=5.0)
     for ii, trials in enumerate(model.fit_iter(self.X, self.Y)):
         assert trials is model.trials
         assert len(trials.trials) == ii
         if ii == 10:
             break
Example #4
def sklearn_digits( classifier, algorithm, max_evals=100, seed=1,
                    filename = 'none', preproc=[], loss=None ):

  global suppress_output
  if suppress_output:
    dump_file = None
  else:
    dump_file = filename+'.dump'
  
  estim = hyperopt_estimator( classifier=classifier, algo=algorithm,
                              preprocessing=preproc,
                              max_evals=max_evals, trial_timeout=60,
                              fit_increment_dump_filename=dump_file,
                              loss_fn=loss, verbose=1)
  
  filename = filename + '.out'

  digits = load_digits()

  X = digits.data
  y = digits.target

  test_size = int( 0.2 * len( y ) )
  np.random.seed( seed )
  indices = np.random.permutation(len(X))
  X_train = X[ indices[:-test_size]]
  y_train = y[ indices[:-test_size]]
  X_test = X[ indices[-test_size:]]
  y_test = y[ indices[-test_size:]]


  print(y_train.shape)
  print(y_test.shape)
  
  find_model( X_train, y_train, X_test, y_test, estim, filename )
Example #5
 def test_fit_iter_basic(self):
     model = hyperopt_estimator(verbose=1, trial_timeout=5.0)
     for ii, trials in enumerate(model.fit_iter(self.X, self.Y)):
         assert trials is model.trials
         assert len(trials.trials) == ii
         if ii == 10:
             break
Example #6
def test_continuous_loss_fn():
    """
    Demonstrate using a custom loss function with the continuous_loss_fn
    option.
    """

    from sklearn.metrics import log_loss

    # Generate some random data
    X = np.hstack([
        np.vstack([
            np.random.normal(0,1,size=(1000,10)),
            np.random.normal(1,1,size=(1000,10)),
        ]),
        np.random.normal(0,1,size=(2000,10)),
    ])
    y = np.zeros(2000)
    y[:1000] = 1

    def loss_function(targ, pred):
        # hyperopt_estimator flattens the prediction when saving it.  This also
        # affects multilabel classification.
        pred = pred.reshape( (-1, 2) )
        return log_loss(targ, pred[:,1])

    # Try to fit an SGD model using log_loss as the loss function
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
        loss_fn = loss_function,
        continuous_loss_fn=True,
    )
    cls.fit(X,y,cv_shuffle=True)
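The reshape inside loss_function is needed because hyperopt_estimator stores the two-column predict_proba output as a flat array before handing it to the loss. A tiny numpy illustration of that round trip (the numbers are made up for illustration):

import numpy as np

flat = np.array([0.9, 0.1, 0.2, 0.8, 0.6, 0.4])  # 3 samples x 2 classes, flattened
proba = flat.reshape((-1, 2))                     # back to (n_samples, n_classes)
print(proba[:, 1])                                # positive-class column used by log_loss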
Example #7
    def train_one_model(
        name_classifier,
        X: np.array,
        y: np.array,
        mix_algo,
        max_evals: int,
        timeout: int,
        n: int,
    ) -> Tuple:
        name, classifier = name_classifier

        trainlogger.info("i'm using a timeout of {}".format(timeout))
        m = hyperopt_estimator(
            classifier=classifier("classifier"),
            algo=mix_algo,
            trial_timeout=timeout,
            preprocessing=[],
            max_evals=max_evals,
            loss_fn=f1lossfn,  # f1 macro is probably more meaningful than accuracy
            # continuous_loss_fn = True,
            seed=RANDOM_SEED,
        )

        trainlogger.info("training {}".format(name))
        m.fit(
            X, y, cv_shuffle=True,
            n_folds=n)  # hyperopt-sklearn takes care of the cross-validation

        m.retrain_best_model_on_full_data(X, y)

        m = m.best_model()["learner"]

        return (name, m)
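f1lossfn is referenced above but not defined in the snippet. A minimal sketch of a macro-F1 loss that fits hyperopt_estimator's loss_fn signature; the name matches the call above, but the exact implementation in the original code is an assumption:

from sklearn.metrics import f1_score

def f1lossfn(targ, pred):
    # hyperopt minimizes the loss, so return 1 minus the macro-averaged F1
    return 1.0 - f1_score(targ, pred, average='macro')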
Example #8
def sklearn_digits( classifier=None ):
  #estim = hyperopt_estimator( classifier=any_classifier('hai'), algo=tpe.suggest )
  if classifier is None:
    classifier = any_classifier('any')
  estim = hyperopt_estimator( classifier=classifier )

  digits = load_digits()
  X = digits.data
  y = digits.target

  test_size = 50
  np.random.seed(0)
  indices = np.random.permutation(len(X))
  X_train = X[ indices[:-test_size]]
  y_train = y[ indices[:-test_size]]
  X_test = X[ indices[-test_size:]]
  y_test = y[ indices[-test_size:]]

  estim.fit( X_train, y_train )

  pred = estim.predict( X_test )
  print( pred )
  print ( y_test )

  print( score( pred, y_test ) ) 
  
  print( estim.best_model() )
Example #9
def test_sparse_input():
    """
    Ensure the estimator can handle sparse X matrices.
    """

    import scipy.sparse as ss

    # Generate some random sparse data
    nrows,ncols,nnz = 100,50,10
    ntrue = nrows // 2
    D,C,R = [],[],[]
    for r in range(nrows):
        feats = np.random.choice(range(ncols), size=nnz, replace=False)
        D.extend([1]*nnz)
        C.extend(feats)
        R.extend([r]*nnz)
    X = ss.csr_matrix( (D,(R,C)), shape=(nrows, ncols))
    y = np.zeros( nrows )
    y[:ntrue] = 1


    # Try to fit an SGD model
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
    )
    cls.fit(X,y)
Example #10
def test_sparse_input():
    """
    Ensure the estimator can handle sparse X matrices.
    """

    import scipy.sparse as ss

    # Generate some random sparse data
    nrows, ncols, nnz = 100, 50, 10
    ntrue = nrows // 2
    D, C, R = [], [], []
    for r in range(nrows):
        feats = np.random.choice(range(ncols), size=nnz, replace=False)
        D.extend([1] * nnz)
        C.extend(feats)
        R.extend([r] * nnz)
    X = ss.csr_matrix((D, (R, C)), shape=(nrows, ncols))
    y = np.zeros(nrows)
    y[:ntrue] = 1

    # Try to fit an SGD model
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
    )
    cls.fit(X, y)
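The loop above builds the sparse matrix in COO style, collecting data, row, and column indices before handing them to csr_matrix. Roughly the same matrix can be produced in one call with scipy.sparse.random; a sketch (nonzero positions are then drawn uniformly over the whole matrix rather than exactly nnz per row):

import numpy as np
import scipy.sparse as ss

nrows, ncols, nnz = 100, 50, 10
X = ss.random(nrows, ncols, density=nnz / ncols, format='csr',
              random_state=0, data_rvs=np.ones)  # all stored values are 1
y = np.zeros(nrows)
y[:nrows // 2] = 1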
Example #11
    def test_sparse_random_projection(self):
        # restrict n_components to be less than or equal to data dimension
        # to prevent sklearn warnings from printing during tests
        n_components = scope.int(hp.quniform(
            'preprocessing.n_components', low=1, high=8, q=1
        ))
        model = hyperopt_estimator(
            classifier=components.gaussian_nb('classifier'),
            preprocessing=[
                components.sparse_random_projection(
                    'preprocessing',
                    n_components=n_components,
                )
            ],
            algo=rand.suggest,
            trial_timeout=5.0,
            max_evals=5,
        )

        X_train = np.random.randn(1000, 8)
        Y_train = (X_train[:, 0] > 0).astype('int')
        X_test = np.random.randn(1000, 8)
        Y_test = (X_test[:, 0] > 0).astype('int')

        model.fit(X_train, Y_train)
        model.score(X_test, Y_test)
Example #12
def test_continuous_loss_fn():
    """
    Demonstrate using a custom loss function with the continuous_loss_fn
    option.
    """

    from sklearn.metrics import log_loss

    # Generate some random data
    X = np.hstack([
        np.vstack([
            np.random.normal(0, 1, size=(1000, 10)),
            np.random.normal(1, 1, size=(1000, 10)),
        ]),
        np.random.normal(0, 1, size=(2000, 10)),
    ])
    y = np.zeros(2000)
    y[:1000] = 1

    def loss_function(targ, pred):
        # hyperopt_estimator flattens the prediction when saving it.  This also
        # affects multilabel classification.
        pred = pred.reshape((-1, 2))
        return log_loss(targ, pred[:, 1])

    # Try to fit an SGD model using log_loss as the loss function
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
        loss_fn=loss_function,
        continuous_loss_fn=True,
    )
    cls.fit(X, y, cv_shuffle=True)
Example #13
 def test_smoke(self):
     # -- verify the space argument is accepted and runs
     space = components.generic_space()
     model = hyperopt_estimator(verbose=1,
                                max_evals=10,
                                trial_timeout=5,
                                space=space)
     model.fit(self.X, self.Y)
Example #14
 def test_fit(self):
     model = hyperopt_estimator(
         classifier=components.any_classifier('classifier'),
         verbose=1,
         max_evals=5,
         trial_timeout=5.0)
     model.fit(self.X, self.Y)
     assert len(model.trials.trials) == 5
Example #15
 def test_regressor(self):
     model = hyperopt_estimator(regressor=reg_fn('regressor'),
                                preprocessing=[],
                                algo=rand.suggest,
                                trial_timeout=50.0,
                                max_evals=2,
                                verbose=True)
     model.fit(self.X_train, self.Y_train)
     model.score(self.X_test, self.Y_test)
Example #16
def hyperopt_850_556():
    # Load data
    dir_key = '1406'
    data_key = '850+556'
    dir_path = dir_path_dict[dir_key]
    data_str = dir_path + data_str_dict[data_key]

    # Redirect stdout to file
    stdout_path = 'outcome_hyperopt_any.any.txt'
    print('[INFO]  stdout_path:\t{}'.format(stdout_path))
    sys.stdout = open(stdout_path, 'w')
    print("[INFO]  params:\talgo=tpe.suggest")

    # Train
    scores = []
    sensis = []
    specis = []
    for i in range(10):
        # Load data
        data_path = data_str.format(i + 1)
        print(data_path)
        trainset, testset = get_dataset(data_path=data_path)
        train_data, train_label = trainset
        test_data, test_label = testset

        # Create the estimator object
        # estim = hyperopt_estimator(classifier=any_classifier('mySVC'),
        # algo=tpe.suggest,
        #                            preprocessing=[standard_scaler('std_scl')])
        estim = hyperopt_estimator(algo=tpe.suggest, seed=RANDOM_SEED)

        # Search the space of classifiers and preprocessing steps and their
        # respective hyperparameters in sklearn to fit a model to the data
        estim.fit(train_data, train_label)

        # show instances of the best classifier
        model = estim.best_model()
        print(model)

        # Make a prediction using the optimized model
        prediction = estim.predict(test_data)
        error = np.count_nonzero(prediction - test_label) / test_data.shape[0]
        sensi, speci = my_scores(test_label, prediction)
        print(1 - error, sensi, speci)

        # Report the accuracy of the classifier on a given set of data
        score = estim.score(test_data, test_label)
        print(score)

        scores.append(score)
        sensis.append(sensi)
        specis.append(speci)

    print(scores)
    print("accur:\t{}\tstd:\t{}".format(np.mean(scores), np.std(scores)))
    print("sensi:\t{}".format(np.mean(sensis)))
    print("speci:\t{}".format(np.mean(specis)))
Example #17
 def test_classifier(self):
     model = hyperopt_estimator(
         classifier=clf_fn('classifier'),
         preprocessing=[],
         algo=rand.suggest,
         trial_timeout=5.0,
         max_evals=5,
     )
     model.fit(self.X_train, self.Y_train_multilabel)
     model.score(self.X_test, self.Y_test_multilabel)
Example #18
 def test_classifier(self):
     model = hyperopt_estimator(
         classifier=clf_fn('classifier'),
         preprocessing=[],
         algo=rand.suggest,
         trial_timeout=5.0,
         max_evals=5,
     )
     model.fit(self.X_train, self.Y_train_multilabel)
     model.score(self.X_test, self.Y_test_multilabel)
Example #19
 def test_regressor(self):
     model = hyperopt_estimator(
         regressor=reg_fn('regressor'),
         preprocessing=[],
         algo=rand.suggest,
         trial_timeout=5.0,
         max_evals=5,
     )
     model.fit(self.X_train, self.Y_train)
     model.score(self.X_test, self.Y_test)
Example #20
 def test_preprocessing(self):
     model = hyperopt_estimator(
         classifier=components.gaussian_nb('classifier'),
         preprocessing=[pre_fn('preprocessing')],
         algo=rand.suggest,
         trial_timeout=5.0,
         max_evals=5,
     )
     model.fit(self.X_train, self.Y_train)
     model.score(self.X_test, self.Y_test)
Example #21
 def test_preprocessing(self):
     model = hyperopt_estimator(
         classifier=components.gaussian_nb('classifier'),
         preprocessing=[pre_fn('preprocessing')],
         algo=rand.suggest,
         trial_timeout=5.0,
         max_evals=5,
     )
     model.fit(self.X_train, self.Y_train)
     model.score(self.X_test, self.Y_test)
Example #22
 def test_multinomial_nb(self):
     model = hyperopt_estimator(
         classifier=components.multinomial_nb('classifier'),
         preprocessing=[],
         algo=rand.suggest,
         trial_timeout=5.0,
         max_evals=5,
     )
     
     # Inputs for MultinomialNB must be non-negative
     model.fit(np.abs(self.X_train), self.Y_train)
     model.score(np.abs(self.X_test), self.Y_test)
Example #23
    def test_multinomial_nb(self):
        model = hyperopt_estimator(
            classifier=components.multinomial_nb('classifier'),
            preprocessing=[],
            algo=rand.suggest,
            trial_timeout=5.0,
            max_evals=5,
        )

        # Inputs for MultinomialNB must be non-negative
        model.fit(np.abs(self.X_train), self.Y_train)
        model.score(np.abs(self.X_test), self.Y_test)
Example #24
def sklearn_newsgroups(classifier,
                       algorithm,
                       max_evals=100,
                       seed=1,
                       filename='none',
                       preproc=[],
                       loss=None):

    global suppress_output
    if suppress_output:
        dump_file = None
    else:
        dump_file = filename + '.dump'

    estim = hyperopt_estimator(classifier=classifier,
                               algo=algorithm,
                               preprocessing=[tfidf('tfidf')],
                               max_evals=max_evals,
                               trial_timeout=240,
                               fit_increment_dump_filename=dump_file,
                               loss_fn=loss)

    filename = filename + '.out'

    if REMOVE_HEADERS:
        train = fetch_20newsgroups(subset='train',
                                   remove=('headers', 'footers', 'quotes'))
        test = fetch_20newsgroups(subset='test',
                                  remove=('headers', 'footers', 'quotes'))
    else:
        train = fetch_20newsgroups(subset='train')
        test = fetch_20newsgroups(subset='test')

    if PRE_VECTORIZE:

        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(train.data)
        y_train = train.target

        X_test = vectorizer.transform(test.data)
        y_test = test.target

    else:

        X_train = train.data
        y_train = train.target

        X_test = test.data
        y_test = test.target

    print(y_train.shape)
    print(y_test.shape)
    find_model(X_train, y_train, X_test, y_test, estim, filename)
Example #25
    def test_one_hot_encoder(self):
        # requires a classifier that can handle sparse data
        model = hyperopt_estimator(
            classifier=components.multinomial_nb('classifier'),
            preprocessing=[components.one_hot_encoder('preprocessing')],
            algo=rand.suggest,
            trial_timeout=5.0,
            max_evals=5,
        )

        # Inputs for one_hot_encoder must be non-negative integers
        model.fit(np.abs(np.round(self.X_test).astype(int)), self.Y_test)
        model.score(np.abs(np.round(self.X_test).astype(int)), self.Y_test)
Example #26
 def test_one_hot_encoder(self):
     # requires a classifier that can handle sparse data
     model = hyperopt_estimator(
         classifier=components.multinomial_nb('classifier'),
         preprocessing=[components.one_hot_encoder('preprocessing')],
         algo=rand.suggest,
         trial_timeout=5.0,
         max_evals=5,
     )
     
     # Inputs for one_hot_encoder must be non-negative integers
     model.fit(np.abs(np.round(self.X_test).astype(int)), self.Y_test)
     model.score(np.abs(np.round(self.X_test).astype(int)), self.Y_test)
Example #27
 def test_warm_start(self):
     model = hyperopt_estimator(
         classifier=components.any_classifier('classifier'), 
         verbose=1, max_evals=5, trial_timeout=5.0)
     params = model.get_params()
     assert params['algo'] == rand.suggest
     assert params['max_evals'] == 5
     model.fit(self.X, self.Y, warm_start=False)
     assert len(model.trials.trials) == 5
     model.set_params(algo=tpe.suggest, max_evals=10)
     params = model.get_params()
     assert params['algo'] == tpe.suggest
     assert params['max_evals'] == 10
     model.fit(self.X, self.Y, warm_start=True)
     assert len(model.trials.trials) == 15  # 5 + 10 = 15.
Example #28
def sklearn_newsgroups( classifier, algorithm, max_evals=100, seed=1,
                        filename='none', preproc=[], loss=None ):

  global suppress_output
  if suppress_output:
    dump_file = None
  else:
    dump_file = filename+'.dump'

  estim = hyperopt_estimator( classifier=classifier, algo=algorithm,
                              preprocessing=[tfidf('tfidf')],
                              max_evals=max_evals, trial_timeout=240,
                              fit_increment_dump_filename=dump_file,
                              loss_fn=loss)
  
  filename = filename + '.out'
  
  if REMOVE_HEADERS:
    train = fetch_20newsgroups( subset='train', 
                                remove=('headers', 'footers', 'quotes') )
    test = fetch_20newsgroups( subset='test', 
                               remove=('headers', 'footers', 'quotes') )
  else:
    train = fetch_20newsgroups( subset='train' )
    test = fetch_20newsgroups( subset='test' )
  
  
  if PRE_VECTORIZE:
    
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform( train.data )
    y_train = train.target

    X_test = vectorizer.transform( test.data )
    y_test = test.target

  else:

    X_train = train.data
    y_train = train.target

    X_test = test.data
    y_test = test.target

  print(y_train.shape)
  print(y_test.shape)
  find_model( X_train, y_train, X_test, y_test, estim, filename )
Example #29
def sklearn_convex(classifier,
                   algorithm,
                   max_evals=100,
                   seed=1,
                   filename='none',
                   preproc=[],
                   loss=None):

    global suppress_output
    if suppress_output:
        dump_file = None
    else:
        dump_file = filename + '.dump'

    estim = hyperopt_estimator(classifier=classifier,
                               algo=algorithm,
                               preprocessing=preproc,
                               max_evals=max_evals,
                               trial_timeout=240,
                               fit_increment_dump_filename=dump_file,
                               loss_fn=loss)

    filename = filename + '.out'

    dataset_store.download('convex')
    trainset, validset, testset = dataset_store.get_classification_problem(
        'convex')

    X_train = trainset.data.mem_data[0]
    y_train = trainset.data.mem_data[1]

    X_valid = validset.data.mem_data[0]
    y_valid = validset.data.mem_data[1]

    X_test = testset.data.mem_data[0]
    y_test = testset.data.mem_data[1]

    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    print(y_train.shape)
    print(y_valid.shape)
    print(y_test.shape)

    #find_model( X_train, y_train, X_test, y_test, estim, filename )
    find_model(X_fulltrain, y_fulltrain, X_test, y_test, estim, filename)
Example #30
def sklearn_digits(classifier,
                   algorithm,
                   max_evals=100,
                   seed=1,
                   filename='none',
                   preproc=[],
                   loss=None):

    global suppress_output
    if suppress_output:
        dump_file = None
    else:
        dump_file = filename + '.dump'

    estim = hyperopt_estimator(classifier=classifier,
                               algo=algorithm,
                               preprocessing=preproc,
                               max_evals=max_evals,
                               trial_timeout=60,
                               fit_increment_dump_filename=dump_file,
                               loss_fn=loss,
                               verbose=1)

    filename = filename + '.out'

    digits = load_digits()

    X = digits.data
    y = digits.target

    test_size = int(0.2 * len(y))
    np.random.seed(seed)
    indices = np.random.permutation(len(X))
    X_train = X[indices[:-test_size]]
    y_train = y[indices[:-test_size]]
    X_test = X[indices[-test_size:]]
    y_test = y[indices[-test_size:]]

    print(y_train.shape)
    print(y_test.shape)

    find_model(X_train, y_train, X_test, y_test, estim, filename)
Example #31
    def test_tfidf(self):
        # requires a classifier that can handle sparse data
        model = hyperopt_estimator(
            classifier=components.multinomial_nb('classifier'),
            preprocessing=[components.tfidf('preprocessing')],
            algo=rand.suggest,
            trial_timeout=5.0,
            max_evals=5,
        )

        X = np.array([
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?',
        ])

        Y = np.array([0, 1, 2, 0])

        model.fit(X, Y)
        model.score(X, Y)
Example #32
def test_crossvalidation():
    """
    Demonstrate performing a k-fold CV using the fit() method.
    """
    # Generate some random data
    X = np.hstack([
        np.vstack([
            np.random.normal(0,1,size=(1000,10)),
            np.random.normal(1,1,size=(1000,10)),
        ]),
        np.random.normal(0,1,size=(2000,10)),
    ])
    y = np.zeros(2000)
    y[:1000] = 1

    # Try to fit a model
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
    )
    cls.fit(X,y,cv_shuffle=True, n_folds=5)
Example #33
    def test_tfidf(self):
        # requires a classifier that can handle sparse data
        model = hyperopt_estimator(
            classifier=components.multinomial_nb('classifier'),
            preprocessing=[components.tfidf('preprocessing')],
            algo=rand.suggest,
            trial_timeout=5.0,
            max_evals=5,
        )

        X = np.array([
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?',
        ])

        Y = np.array([0, 1, 2, 0])
        
        model.fit(X, Y)
        model.score(X, Y)
Example #34
def test_crossvalidation():
    """
    Demonstrate performing a k-fold CV using the fit() method.
    """
    # Generate some random data
    X = np.hstack([
        np.vstack([
            np.random.normal(0, 1, size=(1000, 10)),
            np.random.normal(1, 1, size=(1000, 10)),
        ]),
        np.random.normal(0, 1, size=(2000, 10)),
    ])
    y = np.zeros(2000)
    y[:1000] = 1

    # Try to fit a model
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
    )
    cls.fit(X, y, cv_shuffle=True, n_folds=5)
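After a cross-validated fit like the one above, the chosen configuration can be inspected through the same attributes the other examples use; a short follow-up sketch:

# inspect the search outcome (same API as in the other examples)
print(len(cls.trials.trials))   # number of hyperopt trials that ran
print(cls.best_model())         # best preprocessing + learner found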
Example #35
def sklearn_convex( classifier, algorithm, max_evals=100, seed=1,
                    filename = 'none', preproc=[], loss=None ):

  
  global suppress_output
  if suppress_output:
    dump_file = None
  else:
    dump_file = filename+'.dump'
  
  estim = hyperopt_estimator( classifier=classifier, algo=algorithm,
                              preprocessing=preproc,
                              max_evals=max_evals, trial_timeout=240,
                              fit_increment_dump_filename=dump_file,
                              loss_fn=loss)
  
  filename = filename + '.out'

  dataset_store.download('convex')
  trainset,validset,testset = dataset_store.get_classification_problem('convex')

  X_train = trainset.data.mem_data[0]
  y_train = trainset.data.mem_data[1]
  
  X_valid = validset.data.mem_data[0]
  y_valid = validset.data.mem_data[1]
  
  X_test = testset.data.mem_data[0]
  y_test = testset.data.mem_data[1]

  X_fulltrain = np.concatenate((X_train, X_valid))
  y_fulltrain = np.concatenate((y_train, y_valid))

  print(y_train.shape)
  print(y_valid.shape)
  print(y_test.shape)
  
  #find_model( X_train, y_train, X_test, y_test, estim, filename )
  find_model( X_fulltrain, y_fulltrain, X_test, y_test, estim, filename )
Example #36
def mnist_digits():
  estim = hyperopt_estimator( classifier=any_classifier('hai') )

  digits = fetch_mldata('MNIST original')
  X = digits.data
  y = digits.target

  test_size = int( 0.2 * len( y ) )
  np.random.seed(0)
  indices = np.random.permutation(len(X))
  X_train = X[ indices[:-test_size]]
  y_train = y[ indices[:-test_size]]
  X_test = X[ indices[-test_size:]]
  y_test = y[ indices[-test_size:]]

  estim.fit( X_train, y_train )

  pred = estim.predict( X_test )
  print( pred )
  print ( y_test )

  print( score( pred, y_test ) ) 

  print( estim.best_model() )
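fetch_mldata has been removed from recent scikit-learn releases, so the MNIST download above no longer works there. A hedged sketch of loading the same data through fetch_openml instead; 'mnist_784' is the OpenML copy of MNIST, and its labels come back as strings that need a cast:

from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data                   # shape (70000, 784)
y = mnist.target.astype(int)     # string labels '0'..'9' -> integers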
Example #37
def main():
    experiment = Experiment(
        api_key=os.getenv("COMET_API_KEY", None),
        project_name="mof-oxidation-states",
    )

    print("Loading Data")
    X_train = np.load(FEAT_TRAIN_PATH)
    X_valid = np.load(FEAT_VALID_PATH)
    X_test = np.load(FEAT_TEST_PATH)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)
    X_test = scaler.transform(X_test)

    joblib.dump(
        scaler, os.path.join("models",
                             STARTTIMESTRING + "_" + "scaler.joblib"))
    # vt = VarianceThreshold(0.1)
    # X_train = vt.fit_transform(X_train)
    # X_valid = vt.transform(X_valid)
    # X_test = vt.transform(X_test)

    # joblib.dump(vt, os.path.join("models", STARTTIMESTRING + "_" + "vt.joblib"))

    y_train = np.load(LABEL_TRAIN_PATH)
    y_valid = np.load(LABEL_VALID_PATH)
    y_test = np.load(LABEL_TEST_PATH)

    optimized_models = []

    mix_algo = partial(
        mix.suggest,
        p_suggest=[
            (0.15, rand.suggest),
            (0.7, tpe.suggest),
            (0.15, anneal.suggest),
        ],
    )

    print("Optimizing classifiers")
    for name, classifier in CLASSIFIERS:
        m = hyperopt_estimator(
            classifier=classifier("classifier"),
            algo=mix_algo,
            trial_timeout=TIMEOUT,
            loss_fn=f1_loss,
            preprocessing=[],
            max_evals=MAX_EVALS,
            seed=RANDOM_SEED,
        )

        # Random undersampling to make the base estimators even more uncorrelated
        X_train_, y_train_ = undersample_2(X_train, y_train)
        m.fit(
            np.vstack([X_train_, X_valid]),
            np.vstack([y_train_.reshape(-1, 1),
                       y_valid.reshape(-1, 1)]),
            valid_size=len(X_valid),
            cv_shuffle=False,
        )

        m.retrain_best_model_on_full_data(X_train_, y_train_)

        m = m.best_model()["learner"]

        optimized_models.append((name, m))

    model_eval(
        optimized_models,
        X_train,
        y_train,
        X_test,
        y_test,
        outdir_metrics="metrics",
        outdir_models="models",
    )

    vc = VotingClassifier(optimized_models, voting="soft")

    vc._calibrate_base_estimators("sigmoid", X_valid, y_valid)

    model_eval(
        [("ensemble", vc)],
        X_train,
        y_train,
        X_test,
        y_test,
        outdir_metrics="metrics",
        outdir_models="models",
    )
Example #38
 def test_fit(self):
     model = hyperopt_estimator(
         classifier=components.any_classifier('classifier'), 
         verbose=1, max_evals=5, trial_timeout=5.0)
     model.fit(self.X, self.Y)
     assert len(model.trials.trials) == 5
Example #39
 def test_fit(self):
     model = hyperopt_estimator(verbose=1, max_evals=5, trial_timeout=5.0)
     model.fit(self.X, self.Y)
     assert len(model.trials.trials) == 5
Example #40
 def test_smoke(self):
     # -- verify the space argument is accepted and runs
     space = components.generic_space()
     model = hyperopt_estimator(
         verbose=1, max_evals=10, trial_timeout=5, space=space)
     model.fit(self.X, self.Y)
Example #41
 def test_fit(self):
     model = hyperopt_estimator(verbose=1, max_evals=5, trial_timeout=5.0)
     model.fit(self.X, self.Y)
     assert len(model.trials.trials) == 5
Example #42
    def tune_fit(  # pylint:disable=dangerous-default-value
        models: list,
        X: np.ndarray,
        y: np.ndarray,
        max_evals: int = 400,
        timeout: int = 10 * 60,
        mix_ratios: dict = {"rand": 0.1, "tpe": 0.8, "anneal": 0.1},
        valid_size: float = VALID_SIZE,
    ) -> list:
        """Tune model hyperparameters using hyperopt using a mixed strategy.
        Make sure when using this function that no data leakage happens.
        This data here should be seperate from training and test set.

        Arguments:
            models {list} -- list of models that should be optimized
            X_valid {np.ndarray} -- features
            y_valid {np.ndarray} -- labels
            max_evals {int} -- maximum number of evaluations of hyperparameter optimizations
            timeout {int} -- timeout in seconds after which the optimization stops
            mix_ratios {dict} -- dictionary which provides the ratios of the  different optimization algorithms
            valid_size {float} -- fraction of the last part of the training set used for validation
        Returns:
            list -- list of tuples (name, model) of optimized models
        """

        assert sum(list(mix_ratios.values())) == 1
        assert list(mix_ratios.keys()) == ["rand", "tpe", "anneal"]

        trainlogger.debug("performing hyperparameter optimization")

        optimized_models = []

        mix_algo = partial(
            mix.suggest,
            p_suggest=[
                (mix_ratios["rand"], rand.suggest),
                (mix_ratios["tpe"], tpe.suggest),
                (mix_ratios["anneal"], anneal.suggest),
            ],
        )

        for name, classifier in models:
            m = hyperopt_estimator(
                classifier=classifier("classifier"),
                algo=mix_algo,
                trial_timeout=timeout,
                preprocessing=[],
                max_evals=max_evals,
                seed=RANDOM_SEED,
                # n_jobs=-1, # todo fix installation to use my forks
            )

            m.fit(
                X, y, valid_size=valid_size, cv_shuffle=False
            )  # avoid shuffling to have the same validation set for the ensemble stage

            # choose the model with the best hyperparameters and train it

            n_train = int(len(y) * (1 - valid_size))
            X_train = X[:n_train]
            y_train = y[:n_train]

            m.retrain_best_model_on_full_data(X_train, y_train)

            m = m.best_model()["learner"]

            optimized_models.append((name, m))

        return optimized_models
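A hedged usage sketch for tune_fit, assuming it is reachable as a plain function or staticmethod and that X_valid/y_valid are a split held out from both the training and test sets, as the docstring requires. The model constructors come from hpsklearn's components module, as in the other examples; all names here are illustrative:

from hpsklearn import components

# hypothetical call: tune two classifier families on the held-out split
models = [
    ('sgd', components.sgd),
    ('gaussian_nb', components.gaussian_nb),
]
tuned = tune_fit(models, X_valid, y_valid, max_evals=50, timeout=5 * 60)
for name, fitted in tuned:
    print(name, fitted)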