import hyperopt.pyll.stochastic
from hpsklearn import any_classifier, any_preprocessing
from sklearn.pipeline import Pipeline


def create_random_pipeline():
    """Sample a random classifier/preprocessor pair from the hpsklearn
    search space and wrap it in a scikit-learn Pipeline."""
    pipeline_space = {
        'clf': any_classifier('my_clf'),
        'preprocessor': any_preprocessing('my_prep'),
    }
    sample = hyperopt.pyll.stochastic.sample(pipeline_space)
    classifier = sample['clf']
    try:
        # any_preprocessing yields a (possibly empty) list of steps;
        # use the first one when it exists.
        preprocessor = sample['preprocessor'][0]
        p = Pipeline([('preprocessing', preprocessor),
                      ('classifier', classifier)])
    except IndexError:
        # No preprocessing step was sampled: classifier only.
        p = Pipeline([('classifier', classifier)])
    return p
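# A minimal usage sketch for create_random_pipeline (the digits dataset is
# a stand-in here, not part of the original script); randomly sampled
# pipelines can fail to fit, hence the guard.
if __name__ == '__main__':
    from sklearn.datasets import load_digits

    X, y = load_digits(return_X_y=True)
    pipeline = create_random_pipeline()
    print(pipeline)
    try:
        pipeline.fit(X, y)
        print('Train accuracy:', pipeline.score(X, y))
    except Exception as exc:
        print('Sampled pipeline failed to fit:', exc)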
import argparse
import pickle

import numpy as np
import tqdm
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from sklearn.preprocessing import LabelBinarizer

# list_images, distribute_paths, split_and_get_labels and extract_nail are
# assumed to come from this project's own helper module.


def main():
    # Construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--path", default='nailgun',
                    help="path to nailgun folder")
    ap.add_argument("-m", "--model", required=True,
                    help="name of the file to save the model to")
    ap.add_argument("-cs", "--csize", type=int, default=80,
                    help="size (in pixels) of the crop around the nailgun")
    ap.add_argument("-ex", "--ext", type=str, default='.jpeg',
                    help="extension of the images")
    args = vars(ap.parse_args())

    # Load parameters
    crop_size = args['csize']
    path_to_images = args['path']
    filename = args['model']
    ext = args['ext']
    split_factor = 0.75

    # List all of the images
    paths, labels = list_images(path_to_images, ext)

    # Get paths correctly distributed between good/bad
    n_paths = distribute_paths(paths)

    # Split and generate labels
    (x_train_paths, y_train_str), (x_test_paths, y_test_str) = \
        split_and_get_labels(n_paths, split_factor)
    print('--- Split ---')
    print('Train: ' + str(len(x_train_paths)) +
          ', Test: ' + str(len(x_test_paths)))

    # Fit the label binarizer on the training labels
    lb = LabelBinarizer()
    lb.fit(y_train_str)

    n_feats = crop_size ** 2 + 2
    x_train = np.zeros((len(x_train_paths), n_feats), np.uint8)
    y_train = np.zeros((len(y_train_str), 1), np.int32)

    print('---- Extracting Train samples ----')
    progress = tqdm.tqdm(total=len(x_train_paths))
    for idx, path in enumerate(x_train_paths):
        x_train[idx, :] = extract_nail(path)
        # The label is the last underscore-separated token of the file name
        y_train[idx] = lb.transform([path.split("_")[-1].split(".")[0]])
        progress.update(1)
    y_train = np.ravel(y_train)

    print('---- Extracting Test samples ----')
    progress = tqdm.tqdm(total=len(x_test_paths))
    x_test = np.zeros((len(x_test_paths), n_feats), np.float64)
    y_test = np.zeros((len(y_test_str), 1), np.int32)
    for idx, path in enumerate(x_test_paths):
        x_test[idx, :] = extract_nail(path)
        y_test[idx] = lb.transform([path.split("_")[-1].split(".")[0]])
        progress.update(1)
    y_test = np.ravel(y_test)

    # Define the HyperoptEstimator: search over any classifier and any
    # preprocessing, guided by TPE, with a 30 s timeout per trial
    estim = HyperoptEstimator(classifier=any_classifier('clf'),
                              preprocessing=any_preprocessing('pp'),
                              algo=tpe.suggest,
                              trial_timeout=30)
    estim.fit(x_train, y_train)

    print('---- BEST SCORE (acc) ----')
    print(estim.score(x_test, y_test))
    print('---- BEST MODEL ----')
    print(estim.best_model())

    pkl_filename = 'model/' + filename + '.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(estim.best_model(), file)
    print('--- Correctly saved! ---')
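# Entry point, plus an example invocation; the script name below is a
# placeholder, and the 'model/' directory must exist before the pickle
# is written:
#
#   python train_nailgun.py --path nailgun --model nail_clf --csize 80 --ext .jpeg
if __name__ == '__main__':
    main()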
import pickle
import time

import numpy as np
from hyperopt import fmin, tpe, Trials, STATUS_OK
from hpsklearn import any_classifier, any_preprocessing
from sklearn.pipeline import Pipeline


def objective(sample):
    # NOTE: the opening of this function was missing from the source.
    # Everything above the success-return below is a reconstruction
    # sketch: it assumes module-level x_train/y_train/x_test/y_test and
    # scores the sampled classifier/preprocessor pair as a Pipeline.
    start_time = time.time()
    try:
        steps = [('classifier', sample['clf'])]
        if sample['preprocessor']:
            steps.insert(0, ('preprocessing', sample['preprocessor'][0]))
        pipeline = Pipeline(steps)
        pipeline.fit(x_train, y_train)
        training_time = time.time() - start_time
        loss = -pipeline.score(x_test, y_test)
        total_time = time.time() - start_time
        return {
            'loss': loss,
            'status': STATUS_OK,
            'training_time': training_time,
            'total_time': total_time,
        }
    except Exception:
        # Failed fits get an infinite loss so the search simply moves on.
        total_time = time.time() - start_time
        return {
            'loss': np.inf,
            'status': STATUS_OK,
            'training_time': 0,
            'total_time': total_time,
        }


pipeline_space = {
    'clf': any_classifier('my_clf'),
    'preprocessor': any_preprocessing('my_prep'),
}
print(pipeline_space)

trials = Trials()
best = fmin(objective, space=pipeline_space, algo=tpe.suggest,
            max_evals=200, trials=trials)
print(trials.best_trial)
with open("/tmp/trials.p", "wb") as f:
    pickle.dump(trials, f)
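# Sketch of reloading the pickled trials for offline inspection (assumes
# the /tmp/trials.p file written above):
with open("/tmp/trials.p", "rb") as f:
    loaded_trials = pickle.load(f)
print('Best loss:', loaded_trials.best_trial['result']['loss'])
print('Trials run:', len(loaded_trials.trials))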
from hyperopt import anneal, rand, tpe
from sklearn import metrics

import hpsklearn.components
from hpsklearn import (any_classifier, any_sparse_classifier,
                       any_preprocessing, any_text_preprocessing,
                       knn, nearest_centroid)

# hypertree, CONVEX_EXISTS and the sklearn_* benchmark runners are assumed
# to come from this project's own modules.


def main(data='newsgroups', algo='tpe', seed=1, evals=100, clf='any',
         loss=None, pre='any', text=''):
    filename = text + algo + '_' + clf + '_' + pre + '_' + str(seed) + \
        '_' + str(evals) + '_' + data

    # Resolve the loss metric by name from sklearn.metrics
    if loss is not None:
        if hasattr(metrics, loss):
            loss = getattr(metrics, loss)
        else:
            print('Unknown loss metric specified')
            return 1

    # Resolve the search algorithm
    if algo == 'tpe':
        algorithm = tpe.suggest
    elif algo == 'anneal':
        algorithm = anneal.suggest
    elif algo == 'rand':
        algorithm = rand.suggest
    elif algo == 'tree':
        algorithm = hypertree.tree.suggest
    elif algo == 'gp_tree':
        algorithm = hypertree.gp_tree.suggest
    else:
        print('Unknown algorithm specified')
        return 1

    # TODO: impose restrictions on classifiers that do not work on sparse data
    if clf == 'any':
        if data in ['newsgroups']:
            classifier = any_sparse_classifier('clf')
        else:
            classifier = any_classifier('clf')
    elif clf == 'knn':
        if data in ['newsgroups']:
            classifier = knn('clf', sparse_data=True)
        else:
            classifier = knn('clf')
    elif clf == 'nearest_centroid':
        if data in ['newsgroups']:
            classifier = nearest_centroid('clf', sparse_data=True)
        else:
            classifier = nearest_centroid('clf')
    elif hasattr(hpsklearn.components, clf):
        classifier = getattr(hpsklearn.components, clf)('clf')
    else:
        print('Unknown classifier specified')
        return 1
    # Legacy explicit dispatch, superseded by the hasattr lookup above:
    """
    elif clf == 'svc':
        classifier = svc('clf')
    elif clf == 'knn':
        if data in ['newsgroups']:
            classifier = knn('clf', sparse_data=True)
        else:
            classifier = knn('clf')
    elif clf == 'sgd':
        classifier = sgd('clf')
    elif clf == 'random_forest':
        classifier = random_forest('clf')
    elif clf == 'extra_trees':
        classifier = extra_trees('clf')
    elif clf == 'liblinear_svc':
        classifier = liblinear_svc('clf')
    elif clf == 'multinomial_nb':
        classifier = multinomial_nb('clf')
    elif clf == 'nearest_centroid':
        if data in ['newsgroups']:
            classifier = nearest_centroid('clf', sparse_data=True)
        else:
            classifier = nearest_centroid('clf')
    elif clf == 'rbm':
        classifier = rbm('clf')
    elif clf == 'colkmeans':
        classifier = colkmeans('clf')
    else:
        print('Unknown classifier specified')
        return 1
    """

    # Resolve the preprocessing search space
    if pre == 'any':
        if data in ['newsgroups']:
            preproc = any_text_preprocessing('pre')
        else:
            preproc = any_preprocessing('pre')
    elif pre == 'none':
        preproc = []
    elif hasattr(hpsklearn.components, pre):
        preproc = [getattr(hpsklearn.components, pre)('pre')]
    else:
        print('Unknown preprocessing specified')
        return 1
    # Legacy explicit dispatch, superseded by the hasattr lookup above:
    """
    elif pre == 'pca':
        preproc = [pca('pre')]
    elif pre == 'standard_scaler':
        preproc = [standard_scaler('pre')]
    elif pre == 'min_max_scaler':
        preproc = [min_max_scaler('pre')]
    elif pre == 'normalizer':
        preproc = [normalizer('pre')]
    elif pre == 'tfidf':
        preproc = [tfidf('pre')]
    """

    # Dispatch to the benchmark for the requested dataset
    if data == 'newsgroups':
        sklearn_newsgroups(classifier=classifier, algorithm=algorithm,
                           max_evals=evals, seed=seed, filename=filename,
                           preproc=preproc, loss=loss)
    elif data == 'convex':
        if CONVEX_EXISTS:
            sklearn_convex(classifier=classifier, algorithm=algorithm,
                           max_evals=evals, seed=seed, filename=filename,
                           preproc=preproc, loss=loss)
        else:
            print("Convex dataset not detected on your system, "
                  "install from MLPython")
            return 1
    elif data == 'mnist':
        sklearn_mnist(classifier=classifier, algorithm=algorithm,
                      max_evals=evals, seed=seed, filename=filename,
                      preproc=preproc, loss=loss)
    elif data == 'digits':
        sklearn_digits(classifier=classifier, algorithm=algorithm,
                       max_evals=evals, seed=seed, filename=filename,
                       preproc=preproc, loss=loss)
    else:
        print("Unknown dataset specified")
        return 1
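# Example benchmark invocation, assuming this module is run directly and
# the project's benchmark runners are importable; the argument values
# simply exercise the dispatch table above:
if __name__ == '__main__':
    main(data='digits', algo='tpe', seed=1, evals=100, clf='any', pre='any')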