Example No. 1
    def _create_estimator_random_classifier(classifier=any_classifier('my_clf'),
                                            preprocessing=any_preprocessing('my_pre'),
                                            max_evals=100,
                                            trial_timeout=120,
                                            seed=None,
                                            algo=tpe.suggest):
        """Build an unfitted HyperoptEstimator configured for classifier search.

        :param classifier: hyperopt search space for the classifier step
        :param preprocessing: hyperopt search space for the preprocessing steps
        :param max_evals: maximum number of hyperparameter evaluations to run
        :param trial_timeout: per-trial timeout in seconds
        :param seed: random seed forwarded to the estimator
        :param algo: hyperopt search algorithm (e.g. tpe.suggest)
        :return: the configured HyperoptEstimator
        """
        estim = HyperoptEstimator(classifier=classifier,
                                  preprocessing=preprocessing,
                                  algo=algo,
                                  max_evals=max_evals,
                                  trial_timeout=trial_timeout,
                                  ex_preprocs=None,
                                  regressor=None,
                                  space=None,
                                  loss_fn=None,
                                  continuous_loss_fn=False,
                                  verbose=False,
                                  fit_increment=1,
                                  fit_increment_dump_filename=None,
                                  seed=seed,
                                  use_partial_fit=False,
                                  refit=True)
        return estim
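A minimal usage sketch for the factory above (illustrative only: it assumes the function is reachable at module level; the wine dataset, the variable names, and the reduced max_evals are not part of the original example):

# Hypothetical usage of _create_estimator_random_classifier on a small dataset.
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

estim = _create_estimator_random_classifier(max_evals=10, trial_timeout=60)
estim.fit(X_train, y_train)           # search classifiers + preprocessing with TPE
print(estim.score(X_test, y_test))    # accuracy of the best pipeline on held-out data
print(estim.best_model())             # dict describing the winning pipeline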
Example No. 2
def main():
    for dataset in [
            'DataClass.csv', 'FeatureEnvy.csv', 'GodClass.csv',
            'LongMethod.csv'
    ]:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(f'../../../Dodge/data/smell/{dataset}',
                                        target='SMELLS',
                                        col_start=0,
                                        col_stop=-1)

            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)

            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()

            print('Completed in', b - a, 'seconds.')
        except Exception:
            continue
Example No. 3
def main():
    for dataset in ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF']:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        for i in range(10):
            try:
                print(f'Running {dataset}')
                print('=' * 20)
                data = TextDataLoader.from_file(
                    f'../../../Dodge/data/textmining/{dataset}.txt')

                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_text_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    loss_fn=loss,
                    trial_timeout=30)

                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                print('perf:', metr.get_metrics()[0])
                print(metr.get_metrics())
                print(estim.best_model())
                b = time.time()

                print('Completed in', b - a, 'seconds.')
            except Exception:
                continue
Example No. 4
    def anySample1():
        # Load the iris data and split into training and test sets
        iris = load_iris()
        X = iris.data
        y = iris.target

        # Split into train and test sets
        test_size = int(0.2 * len(y))
        np.random.seed(13)
        indices = np.random.permutation(len(X))
        X_train = X[indices[:-test_size]]
        y_train = y[indices[:-test_size]]
        X_test = X[indices[-test_size:]]
        y_test = y[indices[-test_size:]]

        # Instantiate a HyperoptEstimator with the search space and number of evaluations
        estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                  preprocessing=any_preprocessing('my_pre'),
                                  algo=tpe.suggest,
                                  max_evals=100,
                                  trial_timeout=120)

        # Search the hyperparameter space based on the data
        estim.fit( X_train, y_train )

        # Show the results
        print( estim.score( X_test, y_test ) )
        # 1.0

        print( estim.best_model() )
Example No. 5
def main():
    for dataset in glob.glob('../../../Dodge/data/UCI/*.csv'):
        df = pd.read_csv(dataset)
        target = df.columns[-1]
        sys.stdout = open(f'./hyperopt-log/{dataset.split("/")[-1]}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(dataset,
                                        target=target,
                                        col_start=0,
                                        col_stop=-1)

            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)

            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()

            print('Completed in', b - a, 'seconds.')
        except Exception:
            raise
Example No. 6
def test_hyperopt():
    # Load data
    featuren = 1406
    dir_key = '1406'
    data_key = '850+556'
    dir_path = dir_path_dict[dir_key]
    data_str = dir_path + data_str_dict[data_key]

    # stdout_path = 'outcome_hyperopt_svc.moreinfo1.txt'
    # print '[INFO]  stdout_path:\t{}'.format(stdout_path)
    # sys.stdout = open(stdout_path, 'w')

    print "[INFO]  params:\tclassifier=svc_linear('mySVC'), algo=tpe.suggest, preprocessing=[standard_scaler('std_scl')]"
    scores = []
    sensis = []
    specis = []
    for i in range(10):
        # Load data
        data_path = data_str.format(i + 1)
        print(data_path)
        trainset, testset = get_dataset(data_path=data_path, foldi=i + 1, featuren=featuren)
        train_data, train_label = trainset
        test_data, test_label = testset

        # Create the estimator object
        estim = hyperopt_estimator(classifier=any_classifier('mySVC'),
                                   algo=tpe.suggest,
                                   preprocessing=[standard_scaler('std_scl')],
                                   seed=RANDOM_SEED)

        # Search the space of classifiers and preprocessing steps and their
        # respective hyperparameters in sklearn to fit a model to the data
        estim.fit(train_data, train_label)

        # show instances of the best classifier
        model = estim.best_model()
        print(model)

        # Make a prediction using the optimized model
        prediction = estim.predict(test_data)
        error = np.count_nonzero(prediction - test_label) / test_data.shape[0]
        sensi, speci = my_scores(test_label, prediction)
        print(1 - error, sensi, speci)

        # Report the accuracy of the classifier on a given set of data
        score = estim.score(test_data, test_label)
        print(score)

        scores.append(score)
        sensis.append(sensi)
        specis.append(speci)

    print(scores)
    print("accur:\t{}\tstd:\t{}".format(np.mean(scores), np.std(scores)))
    print("sensi:\t{}".format(np.mean(sensis)))
    print("speci:\t{}".format(np.mean(specis)))
Example No. 7
    def select_best_model(self, max_evals=100, trial_timeout=120):
        if self.train_x is None or self.test_x is None or self.train_y is None or self.test_y is None:
            self.__train_val_split__()

        estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                  preprocessing={},
                                  algo=tpe.suggest,
                                  max_evals=max_evals,
                                  trial_timeout=trial_timeout)

        estim.fit(self.train_x.values, self.train_y.values)
        print(estim.score(self.test_x, self.test_y.values))
        return estim
Example No. 8
def compute_score(X_train, y_train, X_test, y_test, cat_indicator, n_jobs,
                  timeout):
    estim = HyperoptEstimator(classifier=any_classifier('clf'),
                              algo=tpe.suggest,
                              max_evals=60,
                              trial_timeout=timeout / 60)
    best = -1
    try:
        estim.fit(X_train, y_train)
        best = estim.score(X_test, y_test)
        print(estim.best_model())
    except Exception:
        best = -1
    return best
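A minimal sketch of calling compute_score on synthetic data (illustrative only: make_classification, the split, and the 600-second timeout are assumptions, and cat_indicator/n_jobs are passed but not used by the function above):

# Hypothetical call to compute_score on a synthetic binary-classification task.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

best = compute_score(X_tr, y_tr, X_te, y_te, cat_indicator=None, n_jobs=1, timeout=600)
print(best)  # held-out accuracy of the best model, or -1 if the search failed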
Example No. 9
    def hyper_bot(self):
        """
        print accuracy
        :return: None
        """
        model = HyperoptEstimator(
            classifier=any_classifier("cla"),
            preprocessing=any_preprocessing("pre"),
            algo=tpe.suggest,
            max_evals=20,
            trial_timeout=30,
        )
        model.fit(self.x_train, self.y_train)
        accuracy = model.score(self.x_test, self.y_test)
        print(f"Accuracy: {accuracy}")
Example No. 10
def train_hypsklearn(X_train, X_test, y_train, y_test, mtype,
                     common_name_model, problemtype, classes,
                     default_featurenames, transform_model, settings,
                     model_session):

    modelname = common_name_model + '.pickle'
    files = list()

    if mtype in ['classification', 'c']:

        estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                  preprocessing=any_preprocessing('my_pre'),
                                  algo=tpe.suggest,
                                  max_evals=100,
                                  trial_timeout=120)

        # Search the hyperparameter space based on the data
        estim.fit(X_train, y_train)

    elif mtype in ['regression', 'r']:

        estim = HyperoptEstimator(regressor=any_regressor('my_reg'),
                                  preprocessing=any_preprocessing('my_pre'),
                                  algo=tpe.suggest,
                                  max_evals=100,
                                  trial_timeout=120)

        # Search the hyperparameter space based on the data

        estim.fit(X_train, y_train)

    # Show the results
    print(estim.score(X_test, y_test))
    print(estim.best_model())
    scores = estim.score(X_test, y_test)
    bestmodel = str(estim.best_model())

    print('saving classifier to disk')
    f = open(modelname, 'wb')
    pickle.dump(estim, f)
    f.close()

    files.append(modelname)
    modeldir = os.getcwd()

    return modelname, modeldir, files
Example No. 11
def main():
    file_dic = {"ivy":     ["ivy-1.4.csv", "ivy-2.0.csv"],
                "lucene":  ["lucene-2.0.csv", "lucene-2.2.csv"],
                "lucene2": ["lucene-2.2.csv", "lucene-2.4.csv"],
                "poi":     ["poi-1.5.csv", "poi-2.5.csv"],
                "poi2": ["poi-2.5.csv", "poi-3.0.csv"],
                "synapse": ["synapse-1.0.csv", "synapse-1.1.csv"],
                "synapse2": ["synapse-1.1.csv", "synapse-1.2.csv"],
                "camel": ["camel-1.2.csv", "camel-1.4.csv"],
                "camel2": ["camel-1.4.csv", "camel-1.6.csv"],
                "xerces": ["xerces-1.2.csv", "xerces-1.3.csv"],
                "jedit": ["jedit-3.2.csv", "jedit-4.0.csv"],
                "jedit2": ["jedit-4.0.csv", "jedit-4.1.csv"],
                "log4j": ["log4j-1.0.csv", "log4j-1.1.csv"],
                "xalan": ["xalan-2.4.csv", "xalan-2.5.csv"]
                }

    for dataset in file_dic:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        print(f'Running {dataset}')
        print('=' * 20)
        data = DataLoader.from_files(
            base_path='./issue_close_time/', files=file_dic[dataset])

        try:
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)

            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()

            print('Completed in', b - a, 'seconds.')
        except Exception:
            continue
Example No. 12
def main():

    directories = [
        "1 day", "7 days", "14 days", "30 days", "90 days", "180 days",
        "365 days"
    ]
    datasets = [
        "camel", "cloudstack", "cocoon", "hadoop", "deeplearning", "hive",
        "node", "ofbiz", "qpid"
    ]

    for dat in datasets:
        for time_ in directories:
            sys.stdout = open(f'./hyperopt-log/{dat}-{time_}.txt', 'w')
            print(f'Running {dat}-{time_}')
            print('=' * 30)
            data = DataLoader.from_file(
                "/Users/ryedida/PycharmProjects/raise-package/issue_close_time/"
                + time_ + "/" + dat + ".csv",
                target="timeOpen",
                col_start=0)

            try:
                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    loss_fn=partial(loss, dat, time_),
                    trial_timeout=30)

                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                print(metr.get_metrics())
                print(estim.best_model())
                b = time.time()

                print('Completed in', b - a, 'seconds.')
            except Exception:
                continue
Example No. 13
        in the documentation ( http://hyperopt.github.io/hyperopt-sklearn/ ) 
        returns the error:
        "ConnectionResetError: [Errno 54] Connection reset by peer"

"""

# Download the data and split into training and test sets

# Note: fetch_mldata was removed from scikit-learn 0.22+; on current versions the
# equivalent loader is fetch_openml('mnist_784', version=1, as_frame=False).
digits = fetch_mldata('MNIST original')

X = digits.data
y = digits.target

test_size = int(0.2 * len(y))
np.random.seed(13)
indices = np.random.permutation(len(X))
X_train = X[indices[:-test_size]]
y_train = y[indices[:-test_size]]
X_test = X[indices[-test_size:]]
y_test = y[indices[-test_size:]]

estim = HyperoptEstimator(classifier=any_classifier('clf'),
                          algo=tpe.suggest,
                          trial_timeout=300)

estim.fit(X_train, y_train)

print(estim.score(X_test, y_test))
# <<show score here>>
print(estim.best_model())
# <<show model here>>
Example No. 14
# define dataset
X, y = make_classification(n_samples=100,
                           n_features=10,
                           n_informative=5,
                           n_redundant=5,
                           random_state=1)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=1)

# define search
model = HyperoptEstimator(classifier=any_classifier("cla"),
                          preprocessing=any_preprocessing("pre"),
                          algo=tpe.suggest,
                          max_evals=50,
                          trial_timeout=30)

# perform the search
model.fit(X_train, y_train)

# summarize performance
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

# summarize the best model
print(model.best_model())
Example No. 15
def run(dataset, config):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    default = lambda: 0  # sentinel: metric is handled by the estimator's built-in default loss
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        f1=(lambda y, pred: 1.0 - f1_score(y, pred), False),
        # logloss=(log_loss, True),
        mae=(mean_absolute_error, False),
        mse=(mean_squared_error, False),
        msle=(mean_squared_log_error, False),
        r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
        rmse=(mean_squared_error, False),
    )
    loss_fn, continuous_loss_fn = metrics_to_loss_mapping.get(config.metric, (None, False))
    if loss_fn is None:
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, 'accuracy' if is_classification else 'r2')
    if loss_fn is default:
        loss_fn = None

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info(
        "Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
        config.max_runtime_seconds, 'all', config.metric)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc

    if is_classification:
        classifier = any_classifier('clf')
        regressor = None
    else:
        classifier = None
        regressor = any_regressor('rgr')

    estimator = HyperoptEstimator(classifier=classifier,
                                  regressor=regressor,
                                  algo=tpe.suggest,
                                  loss_fn=loss_fn,
                                  continuous_loss_fn=continuous_loss_fn,
                                  trial_timeout=config.max_runtime_seconds,
                                  seed=config.seed,
                                  **training_params)

    with InterruptTimeout(config.max_runtime_seconds * 4 / 3,
                          sig=signal.SIGQUIT):
        with InterruptTimeout(config.max_runtime_seconds,
                              before_interrupt=ft.partial(
                                  kill_proc_tree,
                                  timeout=5,
                                  include_parent=False)):
            with Timer() as training:
                estimator.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = estimator.predict(X_test)

    if is_classification:
        probabilities = "predictions"  # encoding is handled by caller in `__init__.py`
    else:
        probabilities = None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.trials),
                  training_duration=training.duration)
Example No. 16
def main():
	# Construct the argument parser and parse the arguments
	ap = argparse.ArgumentParser()
	ap.add_argument("-p", "--path", default='nailgun', help="path to nailgun folder")
	ap.add_argument("-m", "--model", required= True, help="name of the model file to save the model")
	ap.add_argument("-cs", "--csize", default=80, help="paramter to crop the image around the nailgun")
	ap.add_argument("-ex", "--ext", type=str, default='.jpeg', help="extension of the images")
	args = vars(ap.parse_args())

	# Load parameters
	crop_size = args['csize']
	path_to_images = args['path']
	filename = args['model']
	ext = args['ext']

	split_factor = 0.75

	# List all of the images
	paths, labels = list_images(path_to_images, ext)

	# Get paths correctly distributed good/bad
	n_paths = distribute_paths(paths)

	# Split and generate labels
	(x_train_paths, y_train_str), (x_test_paths, y_test_str) = split_and_get_labels(n_paths, split_factor)

	print('--- Split ---')
	print('Train: '+str(len(x_train_paths))+', Test: '+str(len(x_test_paths)))

	# Load object for label binarizer
	lb = LabelBinarizer()
	lb.fit(y_train_str)	
	
	n_feats = crop_size**2 + 2
	x_train = np.zeros((len(x_train_paths), n_feats), np.uint8)
	y_train = np.zeros((len(y_train_str), 1), np.int32)

	print('---- Extracting Train samples ----')
	progress = tqdm.tqdm(total=len(x_train_paths))

	for idx, path in enumerate(x_train_paths):
		x_train[idx, :] = extract_nail(path)
		y_train[idx] = lb.transform([path.split("_")[-1].split(".")[0]])
		progress.update(1)

	y_train = np.ravel(y_train)

	print('---- Extracting Test samples ----')
	progress = tqdm.tqdm(total=len(x_test_paths))

	x_test = np.zeros((len(x_test_paths), n_feats), np.float64)
	y_test = np.zeros((len(y_test_str), 1), np.int32)
	for idx, path in enumerate(x_test_paths):
		x_test[idx, :] = extract_nail(path)
		y_test[idx] = lb.transform([path.split("_")[-1].split(".")[0]])
		progress.update(1)

	y_test = np.ravel(y_test)

	# Define HyperoptEstimator
	estim = HyperoptEstimator(classifier=any_classifier('clf'), preprocessing=any_preprocessing('pp'), algo=tpe.suggest, trial_timeout=30)
	estim.fit(x_train, y_train)

	print('---- BEST SCORE (acc) ----')
	print( estim.score( x_test, y_test ) )

	print('---- BEST MODEL ----')
	print( estim.best_model() )

	pkl_filename = 'model/'+filename+'.pkl'
	with open(pkl_filename, 'wb') as file:
		pickle.dump(estim.best_model(), file)

	print('--- Correctly saved! ---')
Example No. 17
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    default = lambda: 0  # sentinel: metric is handled by the estimator's built-in default loss
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        f1=(lambda y, pred: 1.0 - f1_score(y, pred), False),
        # logloss=(log_loss, True),
        mae=(mean_absolute_error, False),
        mse=(mean_squared_error, False),
        msle=(mean_squared_log_error, False),
        r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
    )
    loss_fn, continuous_loss_fn = metrics_to_loss_mapping.get(config.metric, (None, False))
    if loss_fn is None:
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, 'accuracy' if is_classification else 'r2')
    if loss_fn is default:
        loss_fn = None

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info(
        "Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
        config.max_runtime_seconds, 'all', config.metric)

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    if is_classification:
        classifier = any_classifier('clf')
        regressor = None
    else:
        classifier = None
        regressor = any_regressor('rgr')

    estimator = HyperoptEstimator(classifier=classifier,
                                  regressor=regressor,
                                  algo=tpe.suggest,
                                  loss_fn=loss_fn,
                                  continuous_loss_fn=continuous_loss_fn,
                                  trial_timeout=config.max_runtime_seconds,
                                  seed=config.seed,
                                  **config.framework_params)

    with InterruptTimeout(config.max_runtime_seconds * 4 / 3,
                          sig=signal.SIGQUIT):
        with InterruptTimeout(config.max_runtime_seconds,
                              before_interrupt=ft.partial(
                                  kill_proc_tree,
                                  timeout=5,
                                  include_parent=False)):
            with Timer() as training:
                estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = Encoder('one-hot', target=False,
                            encoded_type=float).fit_transform(
                                predictions) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(estimator.trials),
                training_duration=training.duration)
Example No. 18
def build_model(dataset, pipeline, experiment, param_grid=None, cv=5, scoring='accuracy', n_jobs='auto', test_size=0.3, use_target=None, expanding_window=False):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)
    if ',' in scoring:
        scoring = scoring.split(',')
    # if scoring is precision, make scorer manually to suppress zero_division warnings in case of heavy bias
    if scoring == 'precision':
        scoring = make_scorer(precision_score, zero_division=1)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(
        filename=log_file,
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_model'
    )
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    experiment_index = {}

    if n_jobs == 'auto':
        n_jobs = os.cpu_count()
    # Load parameter grid argument
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)

    logger.info('Start experiment: {} using {} on {}'.format(experiment, pipeline, dataset))
    for _sym, data in dataset_index.items():
        logger.info('Start processing: {}'.format(_sym))
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        current_target = p.TARGET if not use_target else use_target

        # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
        # replace infinity values with nan so that they can later be imputed to a finite value
        features = features.dropna(axis='columns', how='all').dropna().replace([np.inf, -np.inf], np.nan)
        target = targets.loc[features.index][current_target]

        features = features.replace([np.inf, -np.inf], np.nan)
        imputer = SimpleImputer()
        imputer.fit(features.values)
        feat_imp_values = imputer.transform(features.values)
        features = pd.DataFrame(feat_imp_values, index=features.index, columns=features.columns)
        X_train, X_test, y_train, y_test = train_test_split(features.values, target.values, shuffle=False, test_size=test_size)
        # Summarize distribution
        logger.info("Start Hyperopt search")
        if expanding_window:
            cv = TimeSeriesSplit(n_splits=expanding_window)
        #cv = sliding_window_split(X_train, 0.1)
        est = HyperoptEstimator(classifier=any_classifier('my_clf'),
                          preprocessing=any_preprocessing('my_pre'),
                          algo=tpe.suggest,
                          max_evals=100,
                          trial_timeout=120)
        est.fit(X_train, y_train)
        logger.info("End Hyperopt search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = est.best_model()['learner']
        best_score = est.score(X_train, y_train)
        best_params = {}

        # Plot learning curve for the classifier
        #est = p.estimator
        #est.set_params(**best_params)

        _, axes = plt.subplots(3, 3, figsize=(20, 12), dpi=200, constrained_layout=True)
        #plt.tight_layout()
        _train_ax = [ axes[0][0], axes[0][1], axes[0][2] ]
        #plot_learning_curve(est, "{} - Learning curves (Train)".format(_sym), X_train, y_train, axes=_train_ax, cv=cv)

        axes[1][0].set_title("{} - ROC (Train)".format(_sym))
        plot_roc_curve(clf, X_train, y_train, ax=axes[1][0])
        axes[1][1].set_title("{} - Precision/Recall (Train)".format(_sym))
        plot_precision_recall_curve(clf, X_train, y_train, ax=axes[1][1])
        axes[1][2].set_title("{} - Confusion matrix (Train)".format(_sym))
        plot_confusion_matrix(clf, X_train, y_train, cmap='Blues', ax=axes[1][2])

        axes[2][0].set_title("{} - ROC (Test)".format(_sym))
        plot_roc_curve(clf, X_test, y_test, ax=axes[2][0])
        axes[2][1].set_title("{} - Precision/Recall (Test)".format(_sym))
        plot_precision_recall_curve(clf, X_test, y_test, ax=axes[2][1])
        axes[2][2].set_title("{} - Confusion matrix (Test)".format(_sym))
        plot_confusion_matrix(clf, X_test, y_test, cmap='Oranges', ax=axes[2][2])

        curve_path = '{}{}_learning_curve.png'.format(reports_dir, _sym)
        plt.savefig(curve_path)
        plt.close()

        # Test ensemble's performance on training and test sets
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train, predictions1, output_dict=True)
        logger.info("Classification report on train set:\n{}".format(classification_report(y_train, predictions1)))
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test, predictions2, output_dict=True)
        logger.info("Classification report on test set\n{}".format(classification_report(y_test, predictions2)))

        report = {
            'training_set': {
                'features':X_train.shape[1],
                'records':X_train.shape[0],
                'class_distribution': get_class_distribution(y_train),
                'classification_report': train_report,
                'accuracy': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'precision': precision_score(y_train, predictions1),
                'recall': recall_score(y_train, predictions1),
                'f1': f1_score(y_train, predictions1),
                'y_true':[y for y in y_train],
                'y_pred':[y for y in predictions1]
            },
            'test_set': {
                'features':X_test.shape[1],
                'records':X_test.shape[0],
                'class_distribution':get_class_distribution(y_test),
                'classification_report': test_report,
                'accuracy': accuracy_score(y_test, predictions2),
                'precision': precision_score(y_test, predictions2),
                'mse': mean_squared_error(y_test, predictions2),
                'recall': recall_score(y_test, predictions2),
                'f1': f1_score(y_test, predictions2),
                'y_true': [y for y in y_test],
                'y_pred': [y for y in predictions2]
            }
        }
        # If the classifier has a feature_importances attribute, save it in the report
        feature_importances = None
        if hasattr(clf, 'feature_importances_'):
            feature_importances = clf.feature_importances_
        elif hasattr(clf, 'named_steps') and hasattr(clf.named_steps, 'c') and hasattr(clf.named_steps.c, 'feature_importances_'):
            feature_importances = clf.named_steps.c.feature_importances_
        if feature_importances is not None:
            importances = {features.columns[i]: v for i, v in enumerate(feature_importances)}
            labeled = {str(k): float(v) for k, v in sorted(importances.items(), key=lambda item: -item[1])}
            report['feature_importances'] = labeled
        if hasattr(clf, 'ranking_'):
            report['feature_rank'] = {features.columns[i]: s for i, s in enumerate(clf.ranking_)}
        if hasattr(clf, 'support_'):
            report['feature_support'] = [features.columns[i] for i, s in enumerate(clf.support_) if s]
        train_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct']) for k, d in get_class_distribution(y_train).items()]
        test_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct']) for k, d in get_class_distribution(y_test).items()]

        logger.info('Model evaluation: \n'
              '== Training set ==\n'
              '\t # Features: {} | # Records: {}\n '
              '\tClass distribution:\n{}\n'
              '\tAccuracy: {}\n'
              '\tPrecision: {}\n'
              '\tMSE: {}\n' \
              '\tRecall: {}\n' \
              '\tF1: {}\n' \
              '== Test set ==\n'
              '\t # Features: {} | # Records: {}\n '
              '\tClass distribution:\n{}\n'
              '\tAccuracy: {}\n'
              '\tPrecision: {}\n'
              '\tMSE: {}\n' \
              '\tRecall: {}\n' \
              '\tF1: {}\n' \
              .format(X_train.shape[1], X_train.shape[0], '\n'.join(train_dist),
                      report['training_set']['accuracy'], report['training_set']['precision'], report['training_set']['mse'],
                      report['training_set']['recall'], report['training_set']['f1'],
                      X_test.shape[1], X_test.shape[0], '\n'.join(test_dist),
                      report['test_set']['accuracy'], report['test_set']['precision'], report['test_set']['mse'],
                      report['test_set']['recall'], report['test_set']['f1']
                      )
        )

        # Save a pickle dump of the model
        model_path = '{}{}.p'.format(models_dir, _sym)
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        # Save the model's parameters
        params_path = '{}{}_parameters.json'.format(models_dir, _sym)
        with open(params_path, 'w') as f:
            json.dump(best_params, f, indent=4)
        # Save the report for this model
        report_path = '{}{}.json'.format(reports_dir, _sym)
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=4)
        # Update the experiment's index with the new results, and save it
        experiment_index[_sym] = {
            'model':model_path,
            'params':params_path,
            'report':report_path
        }
        with open(experiment_index_file, 'w') as f:
            json.dump(experiment_index, f, indent=4)
        logger.info("--- {} end ---".format(_sym))
    return experiment_index
Example No. 19
import time
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, any_classifier
from hpsklearn import svc

digits = load_digits()
X = digits.data
y = digits.target
test_size = int(0.2*len(y))
np.random.seed(0)
indices = np.random.permutation(len(X))
X_train = X[indices[:-test_size]]
y_train = y[indices[:-test_size]]
X_test = X[indices[-test_size:]]
y_test = y[indices[-test_size:]]

estim = HyperoptEstimator(classifier=any_classifier('clf'),algo=tpe.suggest, seed=0)
estim.fit(X_train,y_train)
print(estim.score(X_test,y_test))
print(estim.best_model())
Example No. 20
# + [markdown] heading_collapsed=true
# ### Find Best Algorithm with Best Params Using HyperoptEstimator AutoML

# + hidden=true
preproc = hp.choice('myprepros_name',
                    [[min_max_scaler('myprepros_name.norm')],
                     [standard_scaler('myprepros_name.std_scaler')],
                     [
                         min_max_scaler('myprepros_name.norm2'),
                         standard_scaler('myprepros_name.std_scaler2')
                     ]])

# + hidden=true
#with mlflow.start_run():
model = HyperoptEstimator(
    classifier=any_classifier('cla'),
    preprocessing=preproc,  #any_preprocessing('pre'), 
    algo=tpe.suggest,
    max_evals=50,
    trial_timeout=5000)
# perform the search
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)

#mlflow.log_params(params)
#mlflow.log_metric('accuracy', accuracy)

# Logging training data
#mlflow.log_artifact(local_path = '../Data/higgs_boson_training.csv')

# Logging training code
Example No. 21
    s = random_forest('clf' + '.random_forest')
    params_regressor = {
        'regressor': None,
        'preprocessing': None,
        'max_evals': 15,
        'trial_timeout': 100,
        'seed': 1
    }

    params_classifier = {
        'classifier': s,
        'preprocessing': None,
        'max_evals': 15,
        'trial_timeout': 100,
        'seed': 1
    }

    s2 = any_classifier('te')
    print(1)

    # estimator = ModelBuilder.create_estimator(params_regressor)

    # dataset_dict = test_dataset()
    # m = Models(params_classifier, dataset_dict)
    #
    # print(m.fit_and_return(verbose_debug=False))

    print(0)
    # print(create_estimator(test_dataset()))
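The commented-out ModelBuilder.create_estimator call presumably consumes one of the parameter dicts above; a minimal sketch, assuming the keys map directly onto HyperoptEstimator keyword arguments and that X_train/y_train already exist (both assumptions, not part of the original):

    # Hypothetical: unpack params_classifier into a HyperoptEstimator and fit it.
    estim = HyperoptEstimator(algo=tpe.suggest, **params_classifier)
    estim.fit(X_train, y_train)
    print(estim.best_model())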
Example No. 22
            f.write('dataset: {}, Auto-sklearn acc: {} ({}), runtime: {}'.format(dataset, acc_mean, acc_std, runtime))
        filename = "autosklearn_{}.file".format(dataset)
        with open(filename, "wb") as f:
            np.save(f, np.array(acc_all))

### TPE/Hyperopt-sklearn ###
if method == "hpsklearn":
    from hpsklearn import HyperoptEstimator, any_classifier
    if dataset == "all":
        for dataset in datasets_all:
            time_all_models, time_each_model = get_required_time(dataset, n_run)
            hpsklearn_start_time = timeit.default_timer()
            acc_all = []
            for run in range(n_run):
                X_train, y_train, X_test, y_test = test_functions.auto_ml.gen_train_test_data(dataset, seed=run)
                hyperopt = HyperoptEstimator(classifier=any_classifier("clf"), algo=tpe.suggest, max_evals=budget,
                                             preprocessing=[], trial_timeout=time_each_model)
                hyperopt.fit(X_train, y_train)
                y_pred = hyperopt.predict(X_test)
                acc = accuracy_score(y_test, y_pred)
                acc_all.append(acc)
                print("run: {}, acc: {}".format(run, round(acc, 4)))
            hpsklearn_end_time = timeit.default_timer()
            acc_mean = round(np.mean(acc_all), 4)
            acc_std = round(np.std(acc_all) / np.sqrt(n_run), 4)
            runtime = round(hpsklearn_end_time - hpsklearn_start_time, 2)
            print("Hyperopt-sklearn acc: {} ({})".format(acc_mean, acc_std))
            print("Hyperopt-sklearn runtime: {}(s)".format(runtime))
            # save result to file
            with open('hpsklearn_result_{}.txt'.format(dataset), 'w') as f:
                f.write('dataset: {}, Hyperopt-sklearn acc: {} ({}), runtime: {}'.format(dataset, acc_mean, acc_std, runtime))