def exercise_2():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.KFold(len(X), n_folds=10, shuffle=False, random_state=0)
    error = []
    error_mean = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0)

    for i in lst:
        error_mean = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.set_params(n_estimators=i)
            clf.fit(X_train, y_train)
            error_mean.append(zero_one_loss(y_test, clf.predict(X_test)))
        error.append(np.array(error_mean).mean())

    #plot
    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.show()

def exercise_1():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    error = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    # lst_2 = [i for i in range(1, 200)]

    #train the classifier
    clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0)

    #loop estimator parameter
    for i in lst:
        clf.set_params(n_estimators=i)
        clf.fit(X, y)
        error.append(1 - clf.oob_score_)

    #plot
    plt.style.use('ggplot')
    plt.scatter(lst, error)
    plt.xticks(lst)
    plt.show()

def exercise_3():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    error = []
    error_cart = []
    error_mean = []
    error_mean_cart = []
    clf = RandomForestClassifier(n_estimators=100, oob_score=True,
                                 max_features="auto", random_state=0)
    clf_cart = DecisionTreeClassifier()

    error_mean = []
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        clf_cart.fit(X_train, y_train)
        error_mean.append(roc_auc_score(y_test, clf.predict(X_test)))
        error_mean_cart.append(roc_auc_score(y_test, clf_cart.predict(X_test)))
    error.append(np.array(error_mean).mean())
    error_cart.append(np.array(error_mean_cart).mean())

    print 'Error RandomForest: ', error
    print 'Error CART: ', error_cart

def exercise():
    apikey = 'fbc6d4b7868ce52640f6ec74cf076f48'
    connector = APIConnector(apikey=apikey)

    #loading data
    dataset = connector.download_dataset(59)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)
    # iris = pd.DataFrame(X, columns=attribute_names)

    clf = svm.SVC(kernel='rbf')
    # gammapar = []
    # for i in range(-15, 16, 1):
    #     gammapar.append(math.pow(2, i))
    # param_dist = dict(gamma=gammapar)
    # print gammapar

    r = np.logspace(-15, 15, 10, base=2)
    param_dist = {'gamma': r}
    rand = GridSearchCV(clf, param_dist, cv=10, scoring="roc_auc")
    rand.fit(X, y)
    rand.grid_scores_
    rand_mean_scores = [result.mean_validation_score for result in rand.grid_scores_]
    print rand.best_score_
    print rand.best_params_

    plt.style.use('ggplot')
    # x_labels = [i for i in range(31)]
    # gammapar1 = []
    # for i in range(-15, 16, 1):
    #     temp = "2^" + str(i)
    #     gammapar1.append(temp)
    # plt.plot(x_labels, rand_mean_scores)
    # plt.xticks(x_labels, gammapar1)
    # plt.xlabel('Gamma')
    # plt.ylabel('AUC')
    # plt.show()

    x_labels = [i for i in range(10)]
    # collect one label per gamma value that was tried
    gammapar1 = []
    for i in range(len(r)):
        gammapar1.append(r[i])
    # plt.plot(x_labels, rand_mean_scores)
    # plt.xticks(x_labels, gammapar1)
    # plt.xlabel('Gamma')
    # plt.ylabel('AUC')
    # plt.show()

    print rand_mean_scores
    print r
    print x_labels
    print gammapar1

def load_data(dataset_id):
    #openml connection
    home_dir = os.path.expanduser("~")
    openml_dir = os.path.join(home_dir, "openml")
    cache_dir = os.path.join(openml_dir, "cache")
    with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
        key = fh.readline().rstrip('\n')

    openml = APIConnector(cache_directory=cache_dir, apikey=key)
    dataset = openml.download_dataset(dataset_id)

    # load data into panda dataframe
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)
    print("no. of samples :" + str(len(X)))
    return (X, y, attribute_names)

def get_dataset(did):
    home_dir = os.path.expanduser("~")
    openml_dir = os.path.join(home_dir, ".openml")
    cache_dir = os.path.join(openml_dir, "cache")
    with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
        key = fh.readline().rstrip('\n')
        fh.close()

    openml = APIConnector(cache_directory=cache_dir, apikey=key)
    dataset = openml.download_dataset(did)
    # print('Data-set name: %s' % dataset.name)
    # print(dataset.description)
    data, meta = loadarff(dataset.data_file)
    target_attribute = dataset.default_target_attribute
    target_attribute_names = meta[target_attribute][1]
    X, y, attribute_names = dataset.get_dataset(target=target_attribute,
                                                return_attribute_names=True)
    return X, y, attribute_names, target_attribute_names

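# A minimal usage sketch for the get_dataset() helper above (illustrative only:
# it assumes ~/.openml/apikey.txt exists, and dataset id 61 is used purely as an
# example id; nothing below is prescribed by the original code).
X, y, attribute_names, target_attribute_names = get_dataset(61)
print(len(attribute_names))
print(target_attribute_names)
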
def variance_exercise3():
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    total_variance = []
    variance_fold = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0)

    for i in lst:
        variance_fold = []
        clf.set_params(n_estimators=i)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            predicted_elements = clf.predict(X_test)
            # for i in range(0, len(y_test)):
            variance_fold.append(predicted_elements)
        total_variance.append(np.array(variance_fold).var())

    plt.style.use('ggplot')
    plt.plot(lst, total_variance, '#009999', marker='o')
    plt.xticks(lst)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Variance')
    plt.show()

def setUp(self, api_connector_mock):
    __file__ = inspect.getfile(OpenMLTaskTest)
    self.directory = os.path.dirname(__file__)
    self.split_filename = os.path.join(self.directory, "..", "files",
                                       "tasks", "datasplits.arff")
    api_connector_mock.return_value = None
    self.api_connector = APIConnector()
    self.task = OpenMLTask(1, "supervised classification", 1, "class",
                           "crossvalidation wth holdout", None, None, None,
                           None, self.api_connector)

def bias_exercise3():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    error = []
    error_mean = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0)

    for i in lst:
        error_mean = []
        clf.set_params(n_estimators=i)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            predicted_elements = clf.predict(X_test)
            for j in range(0, len(y_test)):
                error_mean.append((y_test[j] - predicted_elements[j]) *
                                  (y_test[j] - predicted_elements[j]))
        error.append(np.array(error_mean).mean())

    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Bias Squared')
    plt.show()

from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers.core import Dense, Activation
from openml.apiconnector import APIConnector
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

apikey = 'b6da739f426042fa9785167b29887d1a'
connector = APIConnector(apikey=apikey)
print 0
dataset = connector.download_dataset(554)

optimizer = None
exception_verbosity = 'high'
print 1

columns_names = ['feature_' + str(x) for x in range(0, 784)]
columns_names.append('target')
print 2

train = dataset.get_dataset()
train = pd.DataFrame(train, columns=columns_names)
y = train['target']
X = train.iloc[:, :-1]

X_train = X.iloc[0:60000].values
Y_train = y.iloc[0:60000].values
X_test = X.iloc[60000:].values
Y_test = y.iloc[60000:].values

from keras.utils import np_utils, generic_utils

# clf, f, feature_names, class_names and figure_name are defined earlier in the
# original script; only this fragment is shown here.
tree.export_graphviz(clf, out_file=f, feature_names=feature_names,
                     class_names=class_names, filled=True, rounded=True,
                     special_characters=True)
command = ["dot", "-Tpng", "dt.dot", "-o", figure_name + ".png"]
try:
    subprocess.check_call(command)
except:
    exit("Could not run dot, ie graphviz, to produce visualization")

#openml connection
home_dir = os.path.expanduser("~")
openml_dir = os.path.join(home_dir, "openml")
cache_dir = os.path.join(openml_dir, "cache")
with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
    key = fh.readline().rstrip('\n')

openml = APIConnector(cache_directory=cache_dir, apikey=key)
dataset = openml.download_dataset(10)

# load data into panda dataframe
X, y, attribute_names = dataset.get_dataset(
    target=dataset.default_target_attribute,
    return_attribute_names=True)
lymph = pd.DataFrame(X, columns=attribute_names)
lymph['class'] = y
print(len(lymph))

# histogram of class variable
n, bins, patches = plt.hist(lymph['class'], facecolor='green')
plt.xlabel('class')

def load(dataset_id):
    print 'Loading data_id %d' % (dataset_id)
    # apikey is expected to be defined at module level (not shown in this snippet)
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(dataset_id)
    return dataset.get_dataset(target=dataset.default_target_attribute,
                               return_attribute_names=True)

class TestAPIConnector(unittest.TestCase):
    """Test the APIConnector

    Note
    ----
    A config file with the username and password must be present to test the
    API calls.
    """

    def setUp(self):
        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        self.workdir = os.path.join(workdir, "tmp")
        try:
            shutil.rmtree(self.workdir)
        except:
            pass

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True
        try:
            apikey = os.environ['OPENMLAPIKEY']
        except:
            apikey = None

        try:
            travis = os.environ['TRAVIS']
            if apikey is None:
                raise Exception('Running on travis-ci, but no environment '
                                'variable OPENMLAPIKEY found.')
        except:
            pass

        self.connector = APIConnector(cache_directory=self.workdir,
                                      apikey=apikey)

    def tearDown(self):
        os.chdir(self.cwd)
        shutil.rmtree(self.workdir)

    ############################################################################
    # Test administrative stuff

    @unittest.skip("Not implemented yet.")
    def test_parse_config(self):
        raise Exception()

    ############################################################################
    # Test all local stuff

    def test_get_cached_datasets(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        datasets = connector.get_cached_datasets()
        self.assertIsInstance(datasets, dict)
        self.assertEqual(len(datasets), 2)
        self.assertIsInstance(list(datasets.values())[0], OpenMLDataset)

    def test_get_cached_dataset(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        with mock.patch.object(APIConnector, "_perform_api_call") as api_mock:
            api_mock.return_value = 400, \
                """<oml:authenticate xmlns:oml = "http://openml.org/openml">
                <oml:session_hash>G9MPPN114ZCZNWW2VN3JE9VF1FMV8Y5FXHUDUL4P</oml:session_hash>
                <oml:valid_until>2014-08-13 20:01:29</oml:valid_until>
                <oml:timezone>Europe/Berlin</oml:timezone>
                </oml:authenticate>"""
            connector = APIConnector(cache_directory=workdir)
            dataset = connector.get_cached_dataset(2)
            self.assertIsInstance(dataset, OpenMLDataset)
            self.assertTrue(connector._perform_api_call.is_called_once())

    def test_get_chached_dataset_description(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        description = connector._get_cached_dataset_description(2)
        self.assertIsInstance(description, dict)

    @unittest.skip("Not implemented yet.")
    def test_get_cached_tasks(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_task(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_splits(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_split(self):
        raise Exception()

    ############################################################################
    # Test all remote stuff

    ############################################################################
    # Datasets

    def test_get_dataset_list(self):
        # We can only perform a smoke test here because we test on dynamic
        # data from the internet...
        datasets = self.connector.get_dataset_list()
        # 1087 as the number of datasets on openml.org
        self.assertTrue(len(datasets) >= 1087)
        for dataset in datasets:
            self.assertEqual(type(dataset), dict)
            self.assertGreaterEqual(len(dataset), 2)
            self.assertIn('did', dataset)
            self.assertIsInstance(dataset['did'], int)
            self.assertIn('status', dataset)
            self.assertTrue(is_string(dataset['status']))
            self.assertIn(dataset['status'],
                          ['in_preparation', 'active', 'deactivated'])

    @unittest.skip("Not implemented yet.")
    def test_datasets_active(self):
        raise NotImplementedError()

    def test_download_datasets(self):
        dids = [1, 2]
        datasets = self.connector.download_datasets(dids)
        self.assertEqual(len(datasets), 2)
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "2", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "dataset.arff")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "2", "dataset.arff")))

    def test_download_dataset(self):
        dataset = self.connector.download_dataset(1)
        self.assertEqual(type(dataset), OpenMLDataset)
        self.assertEqual(dataset.name, 'anneal')
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "dataset.arff")))

    def test_download_rowid(self):
        # Smoke test which checks that the dataset has the row-id set correctly
        did = 164
        dataset = self.connector.download_dataset(did)
        self.assertEqual(dataset.row_id_attribute, 'instance')

    def test_download_dataset_description(self):
        # Only a smoke test, I don't know exactly how to test the URL
        # retrieval and "caching"
        description = self.connector.download_dataset_description(2)
        self.assertIsInstance(description, dict)

    def test_download_dataset_features(self):
        # Only a smoke check
        features = self.connector.download_dataset_features(2)
        self.assertIsInstance(features, dict)

    def test_download_dataset_qualities(self):
        # Only a smoke check
        qualities = self.connector.download_dataset_qualities(2)
        self.assertIsInstance(qualities, dict)

    ############################################################################
    # Tasks

    def test_get_task_list(self):
        # We can only perform a smoke test here because we test on dynamic
        # data from the internet...
        def check_task(task):
            self.assertEqual(type(task), dict)
            self.assertGreaterEqual(len(task), 2)
            self.assertIn('did', task)
            self.assertIsInstance(task['did'], int)
            self.assertIn('status', task)
            self.assertTrue(is_string(task['status']))
            self.assertIn(task['status'],
                          ['in_preparation', 'active', 'deactivated'])

        tasks = self.connector.get_task_list(task_type_id=1)
        # 1759 as the number of supervised classification tasks retrieved
        # openml.org from this call; don't trust the number on openml.org as
        # it also counts private datasets
        self.assertGreaterEqual(len(tasks), 1759)
        for task in tasks:
            check_task(task)

        tasks = self.connector.get_task_list(task_type_id=2)
        self.assertGreaterEqual(len(tasks), 735)
        for task in tasks:
            check_task(task)

    def test_download_task(self):
        task = self.connector.download_task(1)
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "task.xml")))
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "datasets", "1", "dataset.arff")))

    def test_download_split(self):
        task = self.connector.download_task(1)
        split = self.connector.download_split(task)
        self.assertEqual(type(split), OpenMLSplit)
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))

    ############################################################################
    # Runs

    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_run_list(self):
        def check_run(run):
            self.assertIsInstance(run, dict)
            self.assertEqual(len(run), 6)

        runs = self.connector.get_runs_list(task_id=1)
        self.assertGreaterEqual(len(runs), 800)
        for run in runs:
            check_run(run)

        runs = self.connector.get_runs_list(flow_id=1)
        self.assertGreaterEqual(len(runs), 1)
        for run in runs:
            check_run(run)

        runs = self.connector.get_runs_list(setup_id=1)
        self.assertGreaterEqual(len(runs), 260)
        for run in runs:
            check_run(run)

    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_run(self):
        run = self.connector.download_run(473350)
        self.assertGreaterEqual(len(run.tags), 2)
        self.assertEqual(len(run.datasets), 1)
        self.assertGreaterEqual(len(run.files), 2)
        self.assertGreaterEqual(len(run.evaluations), 18)
        self.assertEqual(len(run.evaluations['f_measure']), 2)

    # ###########################################################################
    # Flows

    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_flow_list(self):
        def check_flow(flow):
            self.assertIsInstance(flow, dict)
            self.assertEqual(len(flow), 6)

        flows = self.connector.get_flow_list()
        self.assertGreaterEqual(len(flows), 1448)
        for flow in flows:
            check_flow(flow)

    def test_upload_dataset(self):
        dataset = self.connector.download_dataset(3)
        file_path = os.path.join(self.connector.dataset_cache_dir, "3",
                                 "dataset.arff")
        description = """<oml:data_set_description xmlns:oml="http://openml.org/openml">
            <oml:name>anneal</oml:name>
            <oml:version>1</oml:version>
            <oml:description>test</oml:description>
            <oml:format>ARFF</oml:format>
            <oml:licence>Public</oml:licence>
            <oml:default_target_attribute>class</oml:default_target_attribute>
            <oml:md5_checksum></oml:md5_checksum>
            </oml:data_set_description>"""
        return_code, dataset_xml = self.connector.upload_dataset(description,
                                                                 file_path)
        self.assertEqual(return_code, 200)

    def test_upload_dataset_with_url(self):
        description = """<oml:data_set_description xmlns:oml="http://openml.org/openml">
            <oml:name>UploadTestWithURL</oml:name>
            <oml:version>1</oml:version>
            <oml:description>test</oml:description>
            <oml:format>ARFF</oml:format>
            <oml:url>http://expdb.cs.kuleuven.be/expdb/data/uci/nominal/iris.arff</oml:url>
            </oml:data_set_description>"""
        return_code, dataset_xml = self.connector.upload_dataset(description)
        self.assertEqual(return_code, 200)

    def test_upload_flow(self):
        file_path = os.path.join(self.connector.dataset_cache_dir,
                                 "uploadflow.txt")
        file = open(file_path, "w")
        file.write("Testing upload flow")
        file.close()
        description = '''<oml:flow xmlns:oml="http://openml.org/openml"><oml:name>Test</oml:name><oml:description>description</oml:description> </oml:flow>'''
        return_code, dataset_xml = self.connector.upload_flow(description,
                                                              file_path)
        self.assertEqual(return_code, 200)

    def test_upload_run(self):
        file = urlopen("http://www.openml.org/data/download/224/weka_generated_predictions1977525485999711307.arff")
        file_text = file.read()
        prediction_file_path = os.path.join(
            self.connector.dataset_cache_dir,
            "weka_generated_predictions1977525485999711307.arff")
        with open(prediction_file_path, "wb") as prediction_file:
            prediction_file.write(file_text)
        description_text = '''<oml:run xmlns:oml="http://openml.org/openml"><oml:task_id>59</oml:task_id><oml:flow_id>67</oml:flow_id></oml:run>'''
        description_path = os.path.join(self.connector.dataset_cache_dir,
                                        "description.xml")
        with open(description_path, "w") as description_file:
            description_file.write(description_text)
        return_code, dataset_xml = self.connector.upload_run(
            prediction_file_path, description_path)
        self.assertEqual(return_code, 200)

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from openml.apiconnector import APIConnector
import os
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
import math

apikey = 'fbc6d4b7868ce52640f6ec74cf076f48'
connector = APIConnector(apikey=apikey)
#loading data
dataset = connector.download_dataset(59)


# Utility function to move the midpoint of a colormap to be around
# the values of interest.
class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

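# Hypothetical usage sketch for the MidpointNormalize helper above (the scores
# grid and the colour limits are made-up placeholders, not part of the original
# script): a Normalize subclass like this is passed to imshow via the `norm`
# argument so the colormap midpoint sits at a value of interest rather than
# halfway between vmin and vmax.
scores = np.random.rand(5, 5)  # placeholder grid of validation scores
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.colorbar()
plt.show()
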
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import fbeta_score, confusion_matrix, roc_curve, get_scorer
from subprocess import check_output
from openml.apiconnector import APIConnector

home_dir = os.path.expanduser("~")
openml_dir = os.path.join(home_dir, ".openml")
cache_dir = os.path.join(openml_dir, "cache")
with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
    key = fh.readline().rstrip('\n')
    fh.close()

## load dataset lists
openml = APIConnector(cache_directory=cache_dir, apikey=key)
# datasets = openml.get_dataset_list()
# data = pd.DataFrame(datasets)

dataset = openml.download_dataset(10)
# print('Data-set name: %s' % dataset.name)
# print(dataset.description)
data, meta = loadarff(dataset.data_file)
target_attribute = dataset.default_target_attribute
target_attribute_names = meta[target_attribute][1]
X, y, attribute_names = dataset.get_dataset(target=target_attribute,
                                            return_attribute_names=True)
y_values = np.unique(y)
print('y_values%s' % y_values)

fig, axes_bar = plt.subplots(1, 1)
# plot the distribution of target attribute
y_values_counts, bin_edges = np.histogram(y, y_values.size, density=False)

from openml.apiconnector import APIConnector
from openml.autorun import openml_run
from sklearn import ensemble
import xmltodict
import os

"""
An example of an automated machine learning experiment using openml_run
"""

key_file_path = "apikey.txt"
with open(key_file_path, 'r') as fh:
    key = fh.readline()

task_id = 59
clf = ensemble.RandomForestClassifier()

connector = APIConnector(apikey=key)
task = connector.download_task(task_id)
prediction_path, description_path = openml_run(task, clf)

prediction_abspath = os.path.abspath(prediction_path)
description_abspath = os.path.abspath(description_path)

return_code, response = connector.upload_run(prediction_abspath,
                                             description_abspath)

if return_code == 200:
    response_dict = xmltodict.parse(response.content)
    run_id = response_dict['oml:upload_run']['oml:run_id']
    print("Uploaded run with id %s" % (run_id))