def exercise_2():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.KFold(len(X), n_folds=10, shuffle=False, random_state=0)
    error = []
    error_mean = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0)

    for i in lst:
        error_mean = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.set_params(n_estimators=i)
            clf.fit(X_train, y_train)
            error_mean.append(zero_one_loss(y_test, clf.predict(X_test)))
        error.append(np.array(error_mean).mean())

    #plot
    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.show()

def exercise_1():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    error = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    # lst_2 = [i for i in range(1, 200)]

    #train the classifier
    clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0)

    #loop estimator parameter
    for i in lst:
        clf.set_params(n_estimators=i)
        clf.fit(X, y)
        error.append(1 - clf.oob_score_)

    #plot
    plt.style.use('ggplot')
    plt.scatter(lst, error)
    plt.xticks(lst)
    plt.show()

def exercise_3():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    error = []
    error_cart = []
    error_mean = []
    error_mean_cart = []
    clf = RandomForestClassifier(n_estimators=100, oob_score=True,
                                 max_features="auto", random_state=0)
    clf_cart = DecisionTreeClassifier()

    error_mean = []
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        clf_cart.fit(X_train, y_train)
        error_mean.append(roc_auc_score(y_test, clf.predict(X_test)))
        error_mean_cart.append(roc_auc_score(y_test, clf_cart.predict(X_test)))
    error.append(np.array(error_mean).mean())
    error_cart.append(np.array(error_mean_cart).mean())

    print 'Error RandomForest: ', error
    print 'Error CART: ', error_cart

def exercise():
    apikey = 'fbc6d4b7868ce52640f6ec74cf076f48'
    connector = APIConnector(apikey=apikey)

    #loading data
    dataset = connector.download_dataset(59)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)
    # iris = pd.DataFrame(X, columns=attribute_names)

    clf = svm.SVC(kernel='rbf')
    # gammapar = []
    # for i in range(-15, 16, 1):
    #     gammapar.append(math.pow(2, i))
    # param_dist = dict(gamma=gammapar)
    # print gammapar

    r = np.logspace(-15, 15, 10, base=2)
    param_dist = {'gamma': r}
    rand = GridSearchCV(clf, param_dist, cv=10, scoring="roc_auc")
    rand.fit(X, y)
    rand.grid_scores_
    rand_mean_scores = [result.mean_validation_score for result in rand.grid_scores_]
    print rand.best_score_
    print rand.best_params_

    plt.style.use('ggplot')
    # x_labels = [i for i in range(31)]
    # gammapar1 = []
    # for i in range(-15, 16, 1):
    #     temp = "2^" + str(i)
    #     gammapar1.append(temp)
    # plt.plot(x_labels, rand_mean_scores)
    # plt.xticks(x_labels, gammapar1)
    # plt.xlabel('Gamma')
    # plt.ylabel('AUC')
    # plt.show()

    x_labels = [i for i in range(10)]
    # collect one label per gamma value that was tried
    gammapar1 = []
    for i in range(len(r)):
        gammapar1.append(r[i])
    # plt.plot(x_labels, rand_mean_scores)
    # plt.xticks(x_labels, gammapar1)
    # plt.xlabel('Gamma')
    # plt.ylabel('AUC')
    # plt.show()

    print rand_mean_scores
    print r
    print x_labels
    print gammapar1

def load_data(dataset_id):
    #openml connection
    home_dir = os.path.expanduser("~")
    openml_dir = os.path.join(home_dir, "openml")
    cache_dir = os.path.join(openml_dir, "cache")
    with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
        key = fh.readline().rstrip('\n')

    openml = APIConnector(cache_directory=cache_dir, apikey=key)
    dataset = openml.download_dataset(dataset_id)

    # load data into panda dataframe
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)
    print("no. of samples :" + str(len(X)))
    return (X, y, attribute_names)

def get_dataset(did):
    home_dir = os.path.expanduser("~")
    openml_dir = os.path.join(home_dir, ".openml")
    cache_dir = os.path.join(openml_dir, "cache")
    with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
        key = fh.readline().rstrip('\n')
        fh.close()

    openml = APIConnector(cache_directory=cache_dir, apikey=key)
    dataset = openml.download_dataset(did)
    # print('Data-set name: %s' % dataset.name)
    # print(dataset.description)
    data, meta = loadarff(dataset.data_file)
    target_attribute = dataset.default_target_attribute
    target_attribute_names = meta[target_attribute][1]
    X, y, attribute_names = dataset.get_dataset(target=target_attribute,
                                                return_attribute_names=True)
    return X, y, attribute_names, target_attribute_names

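# A minimal usage sketch for the get_dataset() helper above (illustrative only:
# it assumes ~/.openml/apikey.txt exists, and dataset id 61 is used purely as an
# example id; nothing below is prescribed by the original code).
X, y, attribute_names, target_attribute_names = get_dataset(61)
print(len(attribute_names))
print(target_attribute_names)
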
def variance_exercise3():
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    total_variance = []
    variance_fold = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0)

    for i in lst:
        variance_fold = []
        clf.set_params(n_estimators=i)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            predicted_elements = clf.predict(X_test)
            # for i in range(0, len(y_test)):
            variance_fold.append(predicted_elements)
        total_variance.append(np.array(variance_fold).var())

    plt.style.use('ggplot')
    plt.plot(lst, total_variance, '#009999', marker='o')
    plt.xticks(lst)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Variance')
    plt.show()

def setUp(self, api_connector_mock):
    __file__ = inspect.getfile(OpenMLTaskTest)
    self.directory = os.path.dirname(__file__)
    self.split_filename = os.path.join(self.directory, "..", "files",
                                       "tasks", "datasplits.arff")
    api_connector_mock.return_value = None
    self.api_connector = APIConnector()
    self.task = OpenMLTask(1, "supervised classification", 1, "class",
                           "crossvalidation wth holdout", None, None, None,
                           None, self.api_connector)

def bias_exercise3():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    error = []
    error_mean = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0)

    for i in lst:
        error_mean = []
        clf.set_params(n_estimators=i)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            predicted_elements = clf.predict(X_test)
            for j in range(0, len(y_test)):
                error_mean.append((y_test[j] - predicted_elements[j]) *
                                  (y_test[j] - predicted_elements[j]))
        error.append(np.array(error_mean).mean())

    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Bias Squared')
    plt.show()

from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers.core import Dense, Activation
from openml.apiconnector import APIConnector
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

apikey = 'b6da739f426042fa9785167b29887d1a'
connector = APIConnector(apikey=apikey)
print 0
dataset = connector.download_dataset(554)

optimizer = None
exception_verbosity = 'high'
print 1

columns_names = ['feature_' + str(x) for x in range(0, 784)]
columns_names.append('target')
print 2

train = dataset.get_dataset()
train = pd.DataFrame(train, columns=columns_names)
y = train['target']
X = train.iloc[:, :-1]

X_train = X.iloc[0:60000].values
Y_train = y.iloc[0:60000].values
X_test = X.iloc[60000:].values
Y_test = y.iloc[60000:].values

from keras.utils import np_utils, generic_utils

# clf, f, feature_names, class_names and figure_name are defined earlier in the
# original script; only this fragment is shown here.
tree.export_graphviz(clf, out_file=f, feature_names=feature_names,
                     class_names=class_names, filled=True, rounded=True,
                     special_characters=True)
command = ["dot", "-Tpng", "dt.dot", "-o", figure_name + ".png"]
try:
    subprocess.check_call(command)
except:
    exit("Could not run dot, ie graphviz, to produce visualization")

#openml connection
home_dir = os.path.expanduser("~")
openml_dir = os.path.join(home_dir, "openml")
cache_dir = os.path.join(openml_dir, "cache")
with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
    key = fh.readline().rstrip('\n')

openml = APIConnector(cache_directory=cache_dir, apikey=key)
dataset = openml.download_dataset(10)

# load data into panda dataframe
X, y, attribute_names = dataset.get_dataset(
    target=dataset.default_target_attribute,
    return_attribute_names=True)
lymph = pd.DataFrame(X, columns=attribute_names)
lymph['class'] = y
print(len(lymph))

# histogram of class variable
n, bins, patches = plt.hist(lymph['class'], facecolor='green')
plt.xlabel('class')

def load(dataset_id):
    print 'Loading data_id %d' % (dataset_id)
    # apikey is expected to be defined at module level (not shown in this snippet)
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(dataset_id)
    return dataset.get_dataset(target=dataset.default_target_attribute,
                               return_attribute_names=True)

class TestAPIConnector(unittest.TestCase):
    """Test the APIConnector

    Note
    ----
    A config file with the username and password must be present to test the
    API calls.
    """

    def setUp(self):
        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        self.workdir = os.path.join(workdir, "tmp")
        try:
            shutil.rmtree(self.workdir)
        except:
            pass

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True
        try:
            apikey = os.environ['OPENMLAPIKEY']
        except:
            apikey = None

        try:
            travis = os.environ['TRAVIS']
            if apikey is None:
                raise Exception('Running on travis-ci, but no environment '
                                'variable OPENMLAPIKEY found.')
        except:
            pass

        self.connector = APIConnector(cache_directory=self.workdir,
                                      apikey=apikey)

    def tearDown(self):
        os.chdir(self.cwd)
        shutil.rmtree(self.workdir)

    ############################################################################
    # Test administrative stuff

    @unittest.skip("Not implemented yet.")
    def test_parse_config(self):
        raise Exception()

    ############################################################################
    # Test all local stuff

    def test_get_cached_datasets(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        datasets = connector.get_cached_datasets()
        self.assertIsInstance(datasets, dict)
        self.assertEqual(len(datasets), 2)
        self.assertIsInstance(list(datasets.values())[0], OpenMLDataset)

    def test_get_cached_dataset(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        with mock.patch.object(APIConnector, "_perform_api_call") as api_mock:
            api_mock.return_value = 400, \
                """<oml:authenticate xmlns:oml = "http://openml.org/openml">
                <oml:session_hash>G9MPPN114ZCZNWW2VN3JE9VF1FMV8Y5FXHUDUL4P</oml:session_hash>
                <oml:valid_until>2014-08-13 20:01:29</oml:valid_until>
                <oml:timezone>Europe/Berlin</oml:timezone>
                </oml:authenticate>"""
            connector = APIConnector(cache_directory=workdir)
            dataset = connector.get_cached_dataset(2)
            self.assertIsInstance(dataset, OpenMLDataset)
            self.assertTrue(connector._perform_api_call.is_called_once())

    def test_get_chached_dataset_description(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        description = connector._get_cached_dataset_description(2)
        self.assertIsInstance(description, dict)

    @unittest.skip("Not implemented yet.")
    def test_get_cached_tasks(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_task(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_splits(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_split(self):
        raise Exception()

    ############################################################################
    # Test all remote stuff

    ############################################################################
    # Datasets

    def test_get_dataset_list(self):
        # We can only perform a smoke test here because we test on dynamic
        # data from the internet...
        datasets = self.connector.get_dataset_list()
        # 1087 as the number of datasets on openml.org
        self.assertTrue(len(datasets) >= 1087)
        for dataset in datasets:
            self.assertEqual(type(dataset), dict)
            self.assertGreaterEqual(len(dataset), 2)
            self.assertIn('did', dataset)
            self.assertIsInstance(dataset['did'], int)
            self.assertIn('status', dataset)
            self.assertTrue(is_string(dataset['status']))
            self.assertIn(dataset['status'],
                          ['in_preparation', 'active', 'deactivated'])

    @unittest.skip("Not implemented yet.")
    def test_datasets_active(self):
        raise NotImplementedError()

    def test_download_datasets(self):
        dids = [1, 2]
        datasets = self.connector.download_datasets(dids)
        self.assertEqual(len(datasets), 2)
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "2", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "dataset.arff")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "2", "dataset.arff")))

    def test_download_dataset(self):
        dataset = self.connector.download_dataset(1)
        self.assertEqual(type(dataset), OpenMLDataset)
        self.assertEqual(dataset.name, 'anneal')
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "dataset.arff")))

    def test_download_rowid(self):
        # Smoke test which checks that the dataset has the row-id set correctly
        did = 164
        dataset = self.connector.download_dataset(did)
        self.assertEqual(dataset.row_id_attribute, 'instance')

    def test_download_dataset_description(self):
        # Only a smoke test, I don't know exactly how to test the URL
        # retrieval and "caching"
        description = self.connector.download_dataset_description(2)
        self.assertIsInstance(description, dict)

    def test_download_dataset_features(self):
        # Only a smoke check
        features = self.connector.download_dataset_features(2)
        self.assertIsInstance(features, dict)

    def test_download_dataset_qualities(self):
        # Only a smoke check
        qualities = self.connector.download_dataset_qualities(2)
        self.assertIsInstance(qualities, dict)

    ############################################################################
    # Tasks

    def test_get_task_list(self):
        # We can only perform a smoke test here because we test on dynamic
        # data from the internet...
        def check_task(task):
            self.assertEqual(type(task), dict)
            self.assertGreaterEqual(len(task), 2)
            self.assertIn('did', task)
            self.assertIsInstance(task['did'], int)
            self.assertIn('status', task)
            self.assertTrue(is_string(task['status']))
            self.assertIn(task['status'],
                          ['in_preparation', 'active', 'deactivated'])

        tasks = self.connector.get_task_list(task_type_id=1)
        # 1759 as the number of supervised classification tasks retrieved
        # openml.org from this call; don't trust the number on openml.org as
        # it also counts private datasets
        self.assertGreaterEqual(len(tasks), 1759)
        for task in tasks:
            check_task(task)

        tasks = self.connector.get_task_list(task_type_id=2)
        self.assertGreaterEqual(len(tasks), 735)
        for task in tasks:
            check_task(task)

    def test_download_task(self):
        task = self.connector.download_task(1)
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "task.xml")))
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "datasets", "1", "dataset.arff")))

    def test_download_split(self):
        task = self.connector.download_task(1)
        split = self.connector.download_split(task)
        self.assertEqual(type(split), OpenMLSplit)
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))

    ############################################################################
    # Runs

    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_run_list(self):
        def check_run(run):
            self.assertIsInstance(run, dict)
            self.assertEqual(len(run), 6)

        runs = self.connector.get_runs_list(task_id=1)
        self.assertGreaterEqual(len(runs), 800)
        for run in runs:
            check_run(run)

        runs = self.connector.get_runs_list(flow_id=1)
        self.assertGreaterEqual(len(runs), 1)
        for run in runs:
            check_run(run)

        runs = self.connector.get_runs_list(setup_id=1)
        self.assertGreaterEqual(len(runs), 260)
        for run in runs:
            check_run(run)

    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_run(self):
        run = self.connector.download_run(473350)
        self.assertGreaterEqual(len(run.tags), 2)
        self.assertEqual(len(run.datasets), 1)
        self.assertGreaterEqual(len(run.files), 2)
        self.assertGreaterEqual(len(run.evaluations), 18)
        self.assertEqual(len(run.evaluations['f_measure']), 2)

    # ###########################################################################
    # Flows

    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_flow_list(self):
        def check_flow(flow):
            self.assertIsInstance(flow, dict)
            self.assertEqual(len(flow), 6)

        flows = self.connector.get_flow_list()
        self.assertGreaterEqual(len(flows), 1448)
        for flow in flows:
            check_flow(flow)

    def test_upload_dataset(self):
        dataset = self.connector.download_dataset(3)
        file_path = os.path.join(self.connector.dataset_cache_dir, "3",
                                 "dataset.arff")
        description = """<oml:data_set_description xmlns:oml="http://openml.org/openml">
            <oml:name>anneal</oml:name>
            <oml:version>1</oml:version>
            <oml:description>test</oml:description>
            <oml:format>ARFF</oml:format>
            <oml:licence>Public</oml:licence>
            <oml:default_target_attribute>class</oml:default_target_attribute>
            <oml:md5_checksum></oml:md5_checksum>
            </oml:data_set_description>"""
        return_code, dataset_xml = self.connector.upload_dataset(description,
                                                                 file_path)
        self.assertEqual(return_code, 200)

    def test_upload_dataset_with_url(self):
        description = """<oml:data_set_description xmlns:oml="http://openml.org/openml">
            <oml:name>UploadTestWithURL</oml:name>
            <oml:version>1</oml:version>
            <oml:description>test</oml:description>
            <oml:format>ARFF</oml:format>
            <oml:url>http://expdb.cs.kuleuven.be/expdb/data/uci/nominal/iris.arff</oml:url>
            </oml:data_set_description>"""
        return_code, dataset_xml = self.connector.upload_dataset(description)
        self.assertEqual(return_code, 200)

    def test_upload_flow(self):
        file_path = os.path.join(self.connector.dataset_cache_dir,
                                 "uploadflow.txt")
        file = open(file_path, "w")
        file.write("Testing upload flow")
        file.close()
        description = '''<oml:flow xmlns:oml="http://openml.org/openml"><oml:name>Test</oml:name><oml:description>description</oml:description> </oml:flow>'''
        return_code, dataset_xml = self.connector.upload_flow(description,
                                                              file_path)
        self.assertEqual(return_code, 200)

    def test_upload_run(self):
        file = urlopen("http://www.openml.org/data/download/224/weka_generated_predictions1977525485999711307.arff")
        file_text = file.read()
        prediction_file_path = os.path.join(
            self.connector.dataset_cache_dir,
            "weka_generated_predictions1977525485999711307.arff")
        with open(prediction_file_path, "wb") as prediction_file:
            prediction_file.write(file_text)
        description_text = '''<oml:run xmlns:oml="http://openml.org/openml"><oml:task_id>59</oml:task_id><oml:flow_id>67</oml:flow_id></oml:run>'''
        description_path = os.path.join(self.connector.dataset_cache_dir,
                                        "description.xml")
        with open(description_path, "w") as description_file:
            description_file.write(description_text)
        return_code, dataset_xml = self.connector.upload_run(
            prediction_file_path, description_path)
        self.assertEqual(return_code, 200)

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from openml.apiconnector import APIConnector
import os
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
import math

apikey = 'fbc6d4b7868ce52640f6ec74cf076f48'
connector = APIConnector(apikey=apikey)
#loading data
dataset = connector.download_dataset(59)


# Utility function to move the midpoint of a colormap to be around
# the values of interest.
class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

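# Hypothetical usage sketch for the MidpointNormalize helper above (the scores
# grid and the colour limits are made-up placeholders, not part of the original
# script): a Normalize subclass like this is passed to imshow via the `norm`
# argument so the colormap midpoint sits at a value of interest rather than
# halfway between vmin and vmax.
scores = np.random.rand(5, 5)  # placeholder grid of validation scores
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.colorbar()
plt.show()
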
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import fbeta_score, confusion_matrix, roc_curve, get_scorer
from subprocess import check_output
from openml.apiconnector import APIConnector

home_dir = os.path.expanduser("~")
openml_dir = os.path.join(home_dir, ".openml")
cache_dir = os.path.join(openml_dir, "cache")
with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
    key = fh.readline().rstrip('\n')
    fh.close()

## load dataset lists
openml = APIConnector(cache_directory=cache_dir, apikey=key)
# datasets = openml.get_dataset_list()
# data = pd.DataFrame(datasets)

dataset = openml.download_dataset(10)
# print('Data-set name: %s' % dataset.name)
# print(dataset.description)
data, meta = loadarff(dataset.data_file)
target_attribute = dataset.default_target_attribute
target_attribute_names = meta[target_attribute][1]
X, y, attribute_names = dataset.get_dataset(target=target_attribute,
                                            return_attribute_names=True)
y_values = np.unique(y)
print('y_values%s' % y_values)

fig, axes_bar = plt.subplots(1, 1)
# plot the distribution of target attribute
y_values_counts, bin_edges = np.histogram(y, y_values.size, density=False)

from openml.apiconnector import APIConnector
from openml.autorun import openml_run
from sklearn import ensemble
import xmltodict
import os

"""
An example of an automated machine learning experiment using openml_run
"""

key_file_path = "apikey.txt"
with open(key_file_path, 'r') as fh:
    key = fh.readline()

task_id = 59
clf = ensemble.RandomForestClassifier()

connector = APIConnector(apikey=key)
task = connector.download_task(task_id)
prediction_path, description_path = openml_run(task, clf)

prediction_abspath = os.path.abspath(prediction_path)
description_abspath = os.path.abspath(description_path)

return_code, response = connector.upload_run(prediction_abspath,
                                             description_abspath)

if return_code == 200:
    response_dict = xmltodict.parse(response.content)
    run_id = response_dict['oml:upload_run']['oml:run_id']
    print("Uploaded run with id %s" % (run_id))