Example #1
import hyperopt.pyll.stochastic
from hpsklearn import any_classifier, any_preprocessing
from sklearn.pipeline import Pipeline


def create_random_pipeline():
    # Search space: a random classifier plus an optional preprocessing step
    pipeline_space = {'clf': any_classifier('my_clf'), 'preprocessor': any_preprocessing('my_prep')}

    # Draw one random configuration from the space
    sample = hyperopt.pyll.stochastic.sample(pipeline_space)

    classifier = sample['clf']
    preprocessors = sample['preprocessor']  # a (possibly empty) list of preprocessing steps
    if preprocessors:
        p = Pipeline([('preprocessing', preprocessors[0]), ('classifier', classifier)])
    else:
        p = Pipeline([('classifier', classifier)])

    return p
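
The returned object is an ordinary scikit-learn Pipeline, so it can be fitted and scored directly. A minimal usage sketch, assuming the digits toy dataset (the data loading is illustrative, not part of the original):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipeline = create_random_pipeline()
pipeline.fit(X_train, y_train)
print(pipeline.score(X_test, y_test))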
Example #2
import argparse
import pickle

import numpy as np
import tqdm
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe
from sklearn.preprocessing import LabelBinarizer

# list_images, distribute_paths, split_and_get_labels and extract_nail are
# project-local helpers, assumed importable from this script's own package.


def main():
	# Construct the argument parser and parse the arguments
	ap = argparse.ArgumentParser()
	ap.add_argument("-p", "--path", default='nailgun', help="path to nailgun folder")
	ap.add_argument("-m", "--model", required=True, help="name of the model file to save the model")
	ap.add_argument("-cs", "--csize", type=int, default=80, help="parameter to crop the image around the nailgun")
	ap.add_argument("-ex", "--ext", type=str, default='.jpeg', help="extension of the images")
	args = vars(ap.parse_args())

	# Load parameters
	crop_size = args['csize']
	path_to_images = args['path']
	filename = args['model']
	ext = args['ext']

	split_factor = 0.75

	# List all of the images
	paths, labels = list_images(path_to_images, ext)

	# Get paths correctly distributed good/bad
	n_paths = distribute_paths(paths)

	# Split and generate labels
	(x_train_paths, y_train_str), (x_test_paths, y_test_str) = split_and_get_labels(n_paths, split_factor)

	print('--- Split ---')
	print('Train: '+str(len(x_train_paths))+', Test: '+str(len(x_test_paths)))

	# Load object for label binarizer
	lb = LabelBinarizer()
	lb.fit(y_train_str)

	# Feature vector: the flattened crop (crop_size x crop_size pixels) plus,
	# presumably, two extra features returned by extract_nail
	n_feats = crop_size**2 + 2
	x_train = np.zeros((len(x_train_paths), n_feats), np.uint8)
	y_train = np.zeros((len(y_train_str), 1), np.int32)

	print('---- Extracting Train samples ----')
	progress = tqdm.tqdm(total=len(x_train_paths))

	for idx, path in enumerate(x_train_paths):
		x_train[idx, :] = extract_nail(path)
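		# The good/bad label is encoded in the filename suffix just before the extension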
		y_train[idx] = lb.transform([path.split("_")[-1].split(".")[0]])
		progress.update(1)

	y_train = np.ravel(y_train)

	print('---- Extracting Test samples ----')
	progress = tqdm.tqdm(total=len(x_test_paths))

	x_test = np.zeros((len(x_test_paths), n_feats), np.float64)  # np.float was removed in NumPy 1.24
	y_test = np.zeros((len(y_test_str), 1), np.int32)
	for idx, path in enumerate(x_test_paths):
		x_test[idx, :] = extract_nail(path)
		y_test[idx] = lb.transform([path.split("_")[-1].split(".")[0]])
		progress.update(1)

	y_test = np.ravel(y_test)

	# Define HyperoptEstimator
	estim = HyperoptEstimator(
		classifier=any_classifier('clf'),
		preprocessing=any_preprocessing('pp'),
		algo=tpe.suggest,
		trial_timeout=30)
	estim.fit(x_train, y_train)

	print('---- BEST SCORE (acc) ----')
	print(estim.score(x_test, y_test))

	print('---- BEST MODEL ----')
	print(estim.best_model())

	pkl_filename = 'model/'+filename+'.pkl'
	with open(pkl_filename, 'wb') as file:
		pickle.dump(estim.best_model(), file)

	print('--- Correctly saved! ---')
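
The saved model can be restored later with pickle. A minimal reload sketch (the path is illustrative; note that recent hpsklearn versions return a dict from best_model(), with the fitted classifier under its 'learner' key):

import pickle

with open('model/my_model.pkl', 'rb') as f:  # illustrative path
    best = pickle.load(f)

print(best)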
Example #3
import pickle
import time

import numpy as np
from hpsklearn import any_classifier, any_preprocessing
from hyperopt import STATUS_OK, Trials, fmin, tpe
from sklearn.pipeline import Pipeline


def run_search(x_train, y_train, x_test, y_test):
    # NOTE: the original snippet was truncated here. The wrapper name, its
    # arguments and the objective's try-branch below are a reconstructed
    # sketch: build a pipeline from the sampled configuration, fit and score
    # it, and time both phases.
    def objective(sample):
        start_time = time.time()
        try:
            steps = [('classifier', sample['clf'])]
            if sample['preprocessor']:
                steps.insert(0, ('preprocessing', sample['preprocessor'][0]))
            pipeline = Pipeline(steps)
            pipeline.fit(x_train, y_train)
            training_time = time.time() - start_time
            loss = 1.0 - pipeline.score(x_test, y_test)
            total_time = time.time() - start_time
            return {
                'loss': loss,
                'status': STATUS_OK,
                'training_time': training_time,
                'total_time': total_time
            }
        except Exception:
            # Failed fits are reported with an infinite loss so the search continues
            total_time = time.time() - start_time
            return {
                'loss': np.inf,
                'status': STATUS_OK,
                'training_time': 0,
                'total_time': total_time
            }

    pipeline_space = {
        'clf': any_classifier('my_clf'),
        'preprocessor': any_preprocessing('my_prep')
    }

    print(pipeline_space)

    trials = Trials()
    best = fmin(objective,
                space=pipeline_space,
                algo=tpe.suggest,
                max_evals=200,
                trials=trials)

    print(trials.best_trial)

    with open("/tmp/trials.p", "wb") as f:
        pickle.dump(trials, f)
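
The pickled Trials object can be reloaded later to inspect the search, for example the timings recorded by the objective. A minimal sketch:

import pickle

with open("/tmp/trials.p", "rb") as f:
    trials = pickle.load(f)

result = trials.best_trial['result']
print(result['loss'], result['training_time'], result['total_time'])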
Example #4
import hpsklearn.components
from hpsklearn import (any_classifier, any_preprocessing,
                       any_sparse_classifier, any_text_preprocessing,
                       knn, nearest_centroid)
from hyperopt import anneal, rand, tpe
from sklearn import metrics

# Assumed to come from this benchmark script's own package: hypertree (the
# tree and gp_tree suggest algorithms), the dataset runners
# (sklearn_newsgroups, sklearn_convex, sklearn_mnist, sklearn_digits) and
# the CONVEX_EXISTS flag.

def main(data='newsgroups',
         algo='tpe',
         seed=1,
         evals=100,
         clf='any',
         loss=None,
         pre='any',
         text=''):
    filename = text + algo + '_' + clf + '_' + pre + '_' + str(seed) + '_' + str(evals) + \
               '_' + data

    if loss is not None:
        if hasattr(metrics, loss):
            loss = getattr(metrics, loss)
        else:
            print('Unknown loss metric specified')
            return 1

    if algo == 'tpe':
        algorithm = tpe.suggest
    elif algo == 'anneal':
        algorithm = anneal.suggest
    elif algo == 'rand':
        algorithm = rand.suggest
    elif algo == 'tree':
        algorithm = hypertree.tree.suggest
    elif algo == 'gp_tree':
        algorithm = hypertree.gp_tree.suggest
    else:
        print('Unknown algorithm specified')
        return 1

    # TODO: impose restrictions on classifiers that do not work on sparse data
    if clf == 'any':
        if data in ['newsgroups']:
            classifier = any_sparse_classifier('clf')
        else:
            classifier = any_classifier('clf')
    elif clf == 'knn':
        if data in ['newsgroups']:
            classifier = knn('clf', sparse_data=True)
        else:
            classifier = knn('clf')
    elif clf == 'nearest_centroid':
        if data in ['newsgroups']:
            classifier = nearest_centroid('clf', sparse_data=True)
        else:
            classifier = nearest_centroid('clf')
    elif hasattr(hpsklearn.components, clf):
        classifier = getattr(hpsklearn.components, clf)('clf')
    else:
        print('Unknown classifier specified')
        return 1
    """
  elif clf == 'svc':
    classifier = svc('clf') 
  elif clf == 'knn':
    if data in ['newsgroups']:
      classifier = knn('clf', sparse_data=True) 
    else:
      classifier = knn('clf') 
  elif clf == 'sgd':
    classifier = sgd('clf') 
  elif clf == 'random_forest':
    classifier = random_forest('clf') 
  elif clf == 'extra_trees':
    classifier = extra_trees('clf') 
  elif clf == 'liblinear_svc':
    classifier = liblinear_svc('clf') 
  elif clf == 'multinomial_nb':
    classifier = multinomial_nb('clf') 
  elif clf == 'nearest_centroid':
    if data in ['newsgroups']:
      classifier = nearest_centroid('clf', sparse_data=True) 
    else:
      classifier = nearest_centroid('clf') 
  elif clf == 'rbm':
    classifier = rbm('clf') 
  elif clf == 'colkmeans':
    classifier = colkmeans('clf') 
  else:
    print( 'Unknown classifier specified' )
    return 1
  """

    if pre == 'any':
        if data in ['newsgroups']:
            preproc = any_text_preprocessing('pre')
        else:
            preproc = any_preprocessing('pre')
    elif pre == 'none':
        preproc = []
    elif hasattr(hpsklearn.components, pre):
        preproc = [getattr(hpsklearn.components, pre)('pre')]
    else:
        print('Unknown preprocessing specified')
        return 1
    """
  elif pre == 'pca':
    preproc = [pca('pre')]
  elif pre == 'standard_scaler':
    preproc = [standard_scaler('pre')]
  elif pre == 'min_max_scaler':
    preproc = [min_max_scaler('pre')]
  elif pre == 'normalizer':
    preproc = [normalizer('pre')]
  elif pre == 'tfidf':
    preproc = [tfidf('pre')]
  """

    if data == 'newsgroups':
        sklearn_newsgroups(classifier=classifier,
                           algorithm=algorithm,
                           max_evals=evals,
                           seed=seed,
                           filename=filename,
                           preproc=preproc,
                           loss=loss)
    elif data == 'convex':
        if CONVEX_EXISTS:
            sklearn_convex(classifier=classifier,
                           algorithm=algorithm,
                           max_evals=evals,
                           seed=seed,
                           filename=filename,
                           preproc=preproc,
                           loss=loss)
        else:
            print(
                "Convex dataset not detected on your system, install from MLPython"
            )
            return 1
    elif data == 'mnist':
        sklearn_mnist(classifier=classifier,
                      algorithm=algorithm,
                      max_evals=evals,
                      seed=seed,
                      filename=filename,
                      preproc=preproc,
                      loss=loss)
    elif data == 'digits':
        sklearn_digits(classifier=classifier,
                       algorithm=algorithm,
                       max_evals=evals,
                       seed=seed,
                       filename=filename,
                       preproc=preproc,
                       loss=loss)
    else:
        print("Unknown dataset specified")
        return 1