import logging

import pandas as pd

from dstools.util import load_yaml

# logger configuration
log = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
log.addHandler(handler)
log.setLevel(logging.DEBUG)

'''
Pipeline example using scikit-learn
    - Scaling features
    - Training different models and hyperparameter sets
    - Feature selection using SelectPercentile
'''

pip = Pipeline(config, load_yaml('exp.yaml'), workers=10, save=True)


@pip.load
def load(config):
    data = {}

    # data loading
    train = pd.read_csv('../data/train.csv', index_col='id')
    data['train_x'] = train.drop('survived', axis=1).values
    data['train_y'] = train.survived.values

    test = pd.read_csv('../data/test.csv', index_col='id')
    data['test_x'] = test.values
    data['test_ids'] = test.index
    return data
from dstools.util import load_yaml
from dstools.lab import Experiment
import pandas as pd

# load the experiment configuration and fetch a single record by id
ex = Experiment(load_yaml('exp.yaml')['conf'])
ex.get(_id=['57435574e0f48c88752991ba'])
best = ex.records[0]

# export the test set predictions as a submission file
df = pd.DataFrame(best.test_preds, columns=['PassengerId', 'Survived'])
df.set_index('PassengerId', inplace=True)
df.Survived = df.Survived.astype(int)
df.to_csv('res.csv')
import logging

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from dstools.util import load_yaml
from dstools.sklearn import grid_generator
from pipeline.lab.util import top_k
from pipeline import SKPipeline

# logger configuration
log = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
log.addHandler(handler)
log.setLevel(logging.INFO)

# create pipeline object
pip = SKPipeline(config, load_yaml('exp.yaml'))


# this function should return all the data used to train models as a
# dictionary. In subsequent functions the data will be available in the
# 'data' parameter
@pip.load
def load(config):
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.30, random_state=0)
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    return data
# function used to train models (continues the pipeline script above);
# `scores`, `percentile`, `train_x`, `train_y` and `test_x` are defined in the
# part of the function elided from this excerpt
def train(config, model, data, record):
    record['mean_acc'] = scores.mean()

    # select features and scale them, then refit on the whole training set
    fn = SelectPercentile(f_classif, percentile).fit(train_x, train_y)
    train_x = fn.transform(train_x)
    test_x = fn.transform(test_x)

    scaler = StandardScaler().fit(train_x)
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    model.fit(train_x, train_y)

    # predict on the test set and store (id, prediction) pairs in the record
    ids = data['test_ids']
    preds = model.predict(test_x)
    record['test_preds'] = [(id_, pred) for id_, pred in zip(ids, preds)]


# keep the 30 best records by mean accuracy once every model has been trained
def finalize(config, experiment):
    experiment.records = top_k(experiment.records, 'mean_acc', 30)
    experiment['exp_name'] = config['exp_name']


pip = Pipeline(config, load_yaml('exp.yaml'), workers=4, save=True)

pip.load = load
# pip.model_iterator = model_iterator
pip.model_iterator = model_iterator_autosklearn
pip.train = train
pip.finalize = finalize

pip()
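# The script above assigns pip.model_iterator = model_iterator_autosklearn,
# but the function itself is not shown in this excerpt. The sketch below is a
# hypothetical minimal version, assuming the pipeline trains every model the
# generator yields; the use of auto-sklearn's AutoSklearnClassifier and the
# time budgets are assumptions, not part of the original code.
from autosklearn.classification import AutoSklearnClassifier


def model_iterator_autosklearn(config):
    # a single auto-sklearn model; it runs its own model selection and
    # hyperparameter search within the (assumed) time budget
    yield AutoSklearnClassifier(time_left_for_this_task=300,
                                per_run_time_limit=30)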
from sklearn.metrics import precision_score


# function used to train models, should return a fitted model
def train(config, model, data, record):
    print(record)
    model.fit(data['X_train'], data['y_train'])
    preds = model.predict(data['X_test'])
    # iris is a multiclass problem, so an averaging strategy is required
    record['precision'] = precision_score(data['y_test'], preds,
                                          average='weighted')
    return model


# optional function used when every model has been trained
def finalize(config, experiment):
    pass
    # experiment.records = top_k(experiment.records, 'precision', 4)


# create pipeline object
pip = SKPipeline(config, load_yaml('exp.yaml'), workers=1)

# assign your functions
pip.load = load
pip.model_iterator = model_iterator
pip.train = train
pip.finalize = finalize

# run pipeline
pip()
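# model_iterator is assigned above but never defined in this section. The
# sketch below is a hypothetical version, assuming the pipeline simply trains
# every model the generator yields; the original presumably builds the models
# with dstools.sklearn.grid_generator, whose signature is not shown here, so
# plain scikit-learn is used instead.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid


def model_iterator(config):
    # small hyperparameter grid (values chosen only for illustration)
    grid = ParameterGrid({'n_estimators': [10, 50, 100],
                          'max_depth': [3, 5, None]})
    for params in grid:
        yield RandomForestClassifier(**params)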