示例#1
0
def testFunction(data):
    """Re-run a stored OpenML flow behind an imputer and print its mean accuracy.

    Prints, in order: the flow id of run 1836360, the model of the flow built
    from the imputer + estimator pipeline, and the mean of the per-fold
    ``predictive_accuracy`` values of repeat 0 on task 55.

    Args:
        data: Not used by the body; kept so existing callers keep working.
    """
    # Reference run: only its flow id is reported.
    reference_run = oml.runs.get_run(1836360)
    print(reference_run.flow_id)

    # Convert stored flow 8900 back into an estimator and place a
    # SimpleImputer step ahead of it in the pipeline.
    estimator = oml.flows.flow_to_sklearn(oml.flows.get_flow(8900))
    clf = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer()),
        ('estimator', estimator),
    ])
    flow = flows.sklearn_to_flow(clf)
    print(flow.model)

    task = tasks.get_task(55)
    run = runs.run_flow_on_task(task, flow, avoid_duplicate_runs=True)

    # Per-fold scores of the first repeat.
    fold_scores = dict(run.fold_evaluations['predictive_accuracy'][0])

    # Average over the folds actually reported rather than assuming there are
    # exactly 10; an empty result still prints 0.0 as before.
    fold_count = len(fold_scores)
    print(sum(fold_scores.values()) / fold_count if fold_count else 0.0)
示例#2
0
def challenge():
    """Run a 5-nearest-neighbour classifier on task 14951 and publish the run.

    When ``publish()`` reports return code 200, the id of the uploaded run is
    read from the parsed response and printed.
    """
    # Original note: use dev openml to run.
    # Steps: download the task, run the learner, publish the results.
    challenge_task = tasks.get_task(14951)

    # The bare string literals below hold earlier classifier experiments;
    # they are plain expression statements and are never executed.
    ## clf = BaggingClassifier(SVC(), n_estimators = 128)
    '''
    clf = RandomForestClassifier(n_estimators = 128, class_weight = 'balanced_subsample')
    '''
    '''
    clf = BaggingClassifier(ExtraTreeClassifier(), n_estimators = 20)
    '''
    '''
    param_grid = {'max_depth': np.linspace(1, 15, num = 15, dtype = np.int64),
                  'class_weight': ['balanced', 'balanced_subsample', None],
                  'min_samples_split': np.linspace(1, 15, num = 15, dtype = np.int64),
                  'criterion': ['gini', 'entropy']
                  }
    base_clf = RandomForestClassifier(n_estimators = 20)
    clf = GridSearchCV(base_clf, param_grid = param_grid, scoring = 'roc_auc',
                       cv = 10, pre_dispatch = '2*n_jobs', n_jobs = 4)
    '''
    '''
    ## grid search - gamma and C, grid_den = 20, time needed = 13.36s
    grid_den = 1
    param_grid = {#'C': np.logspace(-5, 5, num = grid_den, base = 2.0),
                  'gamma': np.logspace(-5, 5, num = grid_den, base = 2.0)
                  }
    clf = GridSearchCV(SVC(probability = True), param_grid = param_grid, scoring = 'roc_auc',
                       cv = 10, pre_dispatch = '2*n_jobs', n_jobs = 4)
    '''
    knn_options = dict(n_neighbors=5, algorithm='brute', metric='cosine')
    learner = KNeighborsClassifier(**knn_options)

    finished_run = runs.run_task(challenge_task, learner)
    status, payload = finished_run.publish()

    # Nothing to report unless the upload came back with code 200.
    if status != 200:
        return

    # Pull the run id out of the parsed response for reference.
    parsed = xmltodict.parse(payload)
    uploaded_id = parsed['oml:upload_run']['oml:run_id']
    print("Uploaded run with id %s. Check it at www.openml.org/r/%s" %
          (uploaded_id, uploaded_id))
示例#3
0
def challenge():
    """Publish a brute-force cosine 5-NN run for OpenML task 14951.

    The run id is printed only if ``publish()`` returns code 200.
    """
    # Original note: use dev openml to run.
    # Download task, run learner, publish results.
    target_task = tasks.get_task(14951)

    # Earlier experiments are parked in the bare string literals below;
    # as expression statements they have no effect.
    ## clf = BaggingClassifier(SVC(), n_estimators = 128)

    '''
    clf = RandomForestClassifier(n_estimators = 128, class_weight = 'balanced_subsample')
    '''
    '''
    clf = BaggingClassifier(ExtraTreeClassifier(), n_estimators = 20)
    '''
    '''
    param_grid = {'max_depth': np.linspace(1, 15, num = 15, dtype = np.int64),
                  'class_weight': ['balanced', 'balanced_subsample', None],
                  'min_samples_split': np.linspace(1, 15, num = 15, dtype = np.int64),
                  'criterion': ['gini', 'entropy']
                  }
    base_clf = RandomForestClassifier(n_estimators = 20)
    clf = GridSearchCV(base_clf, param_grid = param_grid, scoring = 'roc_auc',
                       cv = 10, pre_dispatch = '2*n_jobs', n_jobs = 4)
    '''
    '''
    ## grid search - gamma and C, grid_den = 20, time needed = 13.36s
    grid_den = 1
    param_grid = {#'C': np.logspace(-5, 5, num = grid_den, base = 2.0),
                  'gamma': np.logspace(-5, 5, num = grid_den, base = 2.0)
                  }
    clf = GridSearchCV(SVC(probability = True), param_grid = param_grid, scoring = 'roc_auc',
                       cv = 10, pre_dispatch = '2*n_jobs', n_jobs = 4)
    '''
    neighbours_model = KNeighborsClassifier(
        n_neighbors=5,
        algorithm='brute',
        metric='cosine',
    )

    outcome = runs.run_task(target_task, neighbours_model)
    code, body = outcome.publish()

    # Get the run id for reference once the upload is confirmed.
    if code == 200:
        upload_section = xmltodict.parse(body)['oml:upload_run']
        new_run_id = upload_section['oml:run_id']
        message = "Uploaded run with id %s. Check it at www.openml.org/r/%s"
        print(message % (new_run_id, new_run_id))
示例#4
0
 def get_task(self):
     """Return what ``tasks.get_task`` yields for ``self.tid``."""
     return tasks.get_task(self.tid)
    def test_class_labels(self):
        """Assert that the task for ``self.task_id`` reports the two expected labels."""
        fetched = get_task(self.task_id)
        expected_labels = ['tested_negative', 'tested_positive']
        self.assertEqual(fetched.class_labels, expected_labels)
示例#6
0
from openml import tasks, runs
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xmltodict
from sklearn import ensemble

# Fetch task 14951, run a fixed-seed random forest on it and publish the run.
task = tasks.get_task(14951)

# Earlier candidates, kept for reference:
# clf = ensemble.RandomForestClassifier()
# clf = AdaBoostClassifier(algorithm="SAMME.R",n_estimators=700)
# clf = AdaBoostClassifier(algorithm="SAMME",n_estimators=5000)
clf = RandomForestClassifier(
    n_estimators=128,
    criterion="entropy",
    min_samples_split=20,
    bootstrap=True,
    warm_start=True,
    random_state=123,
)
run = runs.run_task(task, clf)
return_code, response = run.publish()

# Report the uploaded run's id, but only when publish() returned code 200.
if return_code == 200:
    response_dict = xmltodict.parse(response)
    upload_info = response_dict["oml:upload_run"]
    run_id = upload_info["oml:run_id"]
    print("Uploaded     run with id %s. Check it at www.openml.org/r/%s" % (run_id, run_id))
示例#7
0
    def test_download_task(self):
        """Fetch the task identified by ``self.task_id`` via ``get_task`` and return it."""
        downloaded = get_task(self.task_id)
        return downloaded
		if(mean_mutual_information == 0):
			features["NoiseToSignalRatio"] = 0

		features["NoiseToSignalRatio"] = (mean_feature_entropy - mean_mutual_information) / mean_mutual_information

	features["InformationFeatureTime"] = sw.duration
	return features

if __name__ == "__main__":
	# Ad-hoc run: compute each meta-feature group on one task, then print the simple group.
	utils.log("Running tests - Importing...")
	from openml import datasets, tasks

	# Task 59 is for dataset 61, the iris dataset, which is good for numerical tests,
	# Task 60 is for dataset 62, a zoo dataset, which contains a lot of categorical information.
	task = tasks.get_task(60)
	data = task.get_dataset()
	X, y, categorical = data.get_data(
		target = data.default_target_attribute,
		return_categorical_indicator = True,
	)

	# Some landmarkers are cross-validated, so the folds of the first repeat are taken.
	# They are materialised into a list because they are iterated over more than once.
	repeats = task.iterate_repeats()
	folds = list(next(repeats))

	simple = simple_metafeatures(X, y, categorical)
	stats = statistical_metafeatures(X, y, categorical)
	info = information_theoretic_metafeatures(X, y, categorical)
	landmarkers = landmarker_metafeatures(X, y, categorical, folds)

	# Only the simple meta-features are echoed.
	for feature_name, feature_value in simple.items():
		print("{}: {}".format(feature_name, feature_value))
    def test_get_X_and_Y(self) -> Tuple[np.ndarray, np.ndarray]:
        """Return the two values unpacked from ``get_X_and_y()`` of the task for ``self.task_id``."""
        features, labels = get_task(self.task_id).get_X_and_y()
        return features, labels
示例#10
0
        features["NoiseToSignalRatio"] = (
            mean_feature_entropy -
            mean_mutual_information) / mean_mutual_information

    features["InformationFeatureTime"] = sw.duration
    return features


if __name__ == "__main__":
    # Manual smoke run over one task; only the simple meta-features get printed.
    utils.log("Running tests - Importing...")
    from openml import datasets, tasks

    # Task 59 is for dataset 61, the iris dataset, which is good for numerical tests,
    # Task 60 is for dataset 62, a zoo dataset, which contains a lot of categorical information.
    task = tasks.get_task(60)
    data = task.get_dataset()
    target_name = data.default_target_attribute
    X, y, categorical = data.get_data(
        target=target_name, return_categorical_indicator=True)

    # Cross-validation is needed for some landmarkers, so take the folds of the
    # first repeat. The generator is unrolled into a list since it is iterated
    # over multiple times.
    first_repeat = next(task.iterate_repeats())
    folds = list(first_repeat)

    simple = simple_metafeatures(X, y, categorical)
    stats = statistical_metafeatures(X, y, categorical)
    info = information_theoretic_metafeatures(X, y, categorical)
    landmarkers = landmarker_metafeatures(X, y, categorical, folds)

    for name, value in simple.items():
        line = "{}: {}".format(name, value)
        print(line)
示例#11
0
def retrieveTaskId(task):
    """Map ``task`` to an id with ``getTaskId`` and return ``tasks.get_task`` for that id."""
    task_id = getTaskId(task)
    return tasks.get_task(task_id)
	information_names = ",".join(mf.information_theoretic_metafeature_names())
	landmarking_names = ",".join(landmarking_metafeature_names())
	subsample_names = ",".join(subsample_metafeature_names())
	learner_names = ",".join([baselearner.__name__ for baselearner in config.base_learners])
	log(learner_names)
	column_names = "{},{},{},{},{},{},{}\n".format("did,subsize", simple_names, statistical_names, information_names, landmarking_names, subsample_names, learner_names)
	fh.write(column_names)

# Then for each dataset (and every desired subset of it), perform landmarking,
# and record execution time.
for task_id in config.test_task_ids:
	if task_id in config.excluded_tasks.keys():
		continue

	log("Getting task {}".format(task_id))
	task = tasks.get_task(task_id)
	did = task.dataset_id
	log("Loading dataset {}".format(did))

	try:
		dataset = task.get_dataset()
		# Impute the values - While values would be imputed when calculating some meta-features anyway, this gives more control.
		X, y, categorical = dataset.get_data(target = task.target_feature, return_categorical_indicator = True)

		#X, categorical = remove_zero_columns(impute_values(X, categorical), categorical)

		# Subsample landmarker need folds, the train+test set of subsample landmarkers should be 500 instances,
		# since that is the size of our smallest dataset.
		# We first create a fold for 500 stratified samples, and then again divide that selection to 10 folds.
		max_size = 500
		number_of_classes = len(np.unique(y))
示例#13
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar  9 14:06:01 2017

@author: joost
"""

import openml as oml
from openml import tasks, runs

# Fetch OpenML task 145677 once at import time.
task = tasks.get_task(145677)

# Unpack the pair returned by get_X_and_y() into module-level X and y.
# NOTE(review): presumably features and targets -- confirm against the openml docs.
X, y = task.get_X_and_y()

from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, PolynomialFeatures, MinMaxScaler
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split
import itertools
import xmltodict