Example #1
    def test_with_concat_features2(self):
        import warnings

        warnings.filterwarnings("ignore")

        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score

        # Operators used below are assumed to be the lale wrappers.
        from lale.lib.lale import ConcatFeatures, Hyperopt, NoOp
        from lale.lib.sklearn import (
            PCA,
            KNeighborsClassifier,
            LogisticRegression,
            Nystroem,
            SimpleImputer,
        )

        data = load_iris()
        X, y = data.data, data.target
        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)
        from lale.operators import make_pipeline

        pipeline = make_pipeline(
            ((((SimpleImputer() | NoOp()) >> pca) & nys) >> concat >> lr)
            | KNeighborsClassifier()
        )
        clf = Hyperopt(estimator=pipeline, max_evals=1, handle_cv_failure=True)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
        warnings.resetwarnings()
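
For reference, the combinators in this example are lale's pipeline algebra: >> composes steps sequentially, & runs branches in parallel (typically joined by ConcatFeatures), and | declares an algorithmic choice for the optimizer to resolve. To also see which pipeline Hyperopt actually selected, a minimal sketch (hedged: it assumes the trained Hyperopt result exposes get_pipeline and that lale operators expose pretty_print):

best_found = trained.get_pipeline()  # assumed accessor for the best pipeline found
print(best_found.pretty_print())     # assumed pretty-printer for lale operators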
Example #2
    def _fit_gbt_num(self, X, y):
        from lale.lib.lale import Project
        from lale.lib.sklearn import SimpleImputer

        gbt = auto_gbt(self.prediction_type)
        trainable = (Project(columns={'type': 'number'}) >>
                     SimpleImputer(strategy='mean') >> gbt())
        self._try_and_add('gbt_num', trainable, X, y)
Example #3
    def _fit_gbt_num(self, X, y):
        from lale.lib.lale import Project
        from lale.lib.sklearn import SimpleImputer

        gbt = auto_gbt(self.prediction_type)
        trainable = (Project(columns={"type": "number"}) >>
                     SimpleImputer(strategy="mean") >> gbt())
        self._try_and_add("gbt_num", trainable, X, y)
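
The helper auto_gbt is not shown in these snippets; judging from the call site, it takes a prediction type and returns an operator class that is then instantiated with gbt(). A purely hypothetical sketch of such a helper, assuming the lale scikit-learn wrappers:

def auto_gbt(prediction_type):
    # Hypothetical sketch: pick a gradient-boosting operator class by task type.
    # The real helper may prefer other boosters (e.g., XGBoost or LightGBM).
    from lale.lib.sklearn import (
        GradientBoostingClassifier,
        GradientBoostingRegressor,
    )

    if prediction_type == "regression":
        return GradientBoostingRegressor
    return GradientBoostingClassifier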
Example #4
    def test_pipeline_AWTTR_1(self):
        trainable = AutoaiTSPipeline(steps=[(
            "AutoaiWindowTransformedTargetRegressor",
            AutoaiWindowTransformedTargetRegressor(
                regressor=SmallDataWindowTransformer() >> SimpleImputer() >>
                RandomForestRegressor()),
        )])
        self.doTestPipeline(trainable, self.y, self.y, self.y, self.y)
Example #5
def auto_prep(X):
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    n_cols = X.shape[1]
    n_cats = len(categorical()(X))
    prep_num = SimpleImputer(strategy="mean")
    prep_cat = SimpleImputer(strategy="most_frequent") >> OneHotEncoder(
        handle_unknown="ignore")
    if n_cats == 0:
        result = prep_num
    elif n_cats == n_cols:
        result = prep_cat
    else:
        result = (
            (Project(columns={"type": "number"}, drop_columns=categorical()) >>
             prep_num)
            & (Project(columns=categorical()) >> prep_cat)) >> ConcatFeatures
    return result
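
The operator returned by auto_prep can be composed with any downstream estimator via lale's >> combinator. A minimal usage sketch, assuming a small pandas DataFrame with one numeric and one string-valued column and the lale LogisticRegression wrapper (the toy data is made up for illustration):

import pandas as pd

from lale.lib.sklearn import LogisticRegression

X = pd.DataFrame({
    "age": [25.0, 32.0, None, 41.0, 57.0, 63.0, 38.0],  # numeric, one missing value
    "color": ["red", "blue", "red", "blue", "red", "blue", "red"],  # low-cardinality strings
})
y = pd.Series([0, 1, 0, 1, 0, 1, 0])

prep = auto_prep(X)  # numeric/categorical branches from the function above
trainable = prep >> LogisticRegression()
trained = trainable.fit(X, y)
predictions = trained.predict(X)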
Example #6
    def test_nested_pipeline1(self):
        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score

        from lale.lib.lale import Hyperopt
        # Operators below are assumed to be the lale wrappers.
        from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression, SimpleImputer

        data = load_iris()
        X, y = data.data, data.target
        # pipeline = KNeighborsClassifier() | (OneHotEncoder(handle_unknown = 'ignore') >> LogisticRegression())
        pipeline = KNeighborsClassifier() | (SimpleImputer() >> LogisticRegression())
        clf = Hyperopt(estimator=pipeline, max_evals=1)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
Example #7
def auto_prep(X):
    from lale.lib.lale import ConcatFeatures
    from lale.lib.lale import Project
    from lale.lib.lale import categorical
    from lale.lib.sklearn import OneHotEncoder
    from lale.lib.sklearn import SimpleImputer
    n_cols = X.shape[1]
    n_cats = len(categorical()(X))
    prep_num = SimpleImputer(strategy='mean')
    prep_cat = (SimpleImputer(strategy='most_frequent') >>
                OneHotEncoder(handle_unknown='ignore'))
    if n_cats == 0:
        result = prep_num
    elif n_cats == n_cols:
        result = prep_cat
    else:
        result = (
            (Project(columns={'type': 'number'}, drop_columns=categorical()) >>
             prep_num)
            & (Project(columns=categorical()) >> prep_cat)) >> ConcatFeatures
    return result
Example #8
    def test_pipeline_AWWR(self):
        trainable = AutoaiTSPipeline(steps=[(
            "AutoaiWindowTransformedTargetRegressor",
            AutoaiWindowedWrappedRegressor(
                regressor=SmallDataWindowTransformer() >> SimpleImputer() >>
                RandomForestRegressor()),
        )])
        self.doTestPipeline(trainable,
                            self.y,
                            self.y,
                            self.y,
                            self.y,
                            optimization=True)
Example #9
    def test_pipeline_AWTTR_2(self):
        trainable = AutoaiTSPipeline(steps=[(
            "AutoaiWindowTransformedTargetRegressor",
            AutoaiWindowTransformedTargetRegressor(
                regressor=SmallDataWindowTransformer() >> SimpleImputer() >>
                RandomForestRegressor(),
                estimator_prediction_type="rowwise",
            ),
        )])
        self.doTestPipeline(trainable,
                            self.y,
                            self.y,
                            self.y,
                            self.y,
                            optimization=True)
Example #10
def fetch(dataset_name, task_type, verbose=False, preprocess=True):
    # Relies on names assumed to be defined at module level: experiments_dict,
    # download_data_dir, numeric_data_types_list, add_schemas, plus the os, urllib,
    # arff, pandas (pd), and scikit-learn imports used below.
    if verbose:
        print('Loading dataset:', dataset_name)
    # Check that the dataset name exists in experiments_dict
    try:
        dataset_name_found = experiments_dict[dataset_name]
        if experiments_dict[dataset_name]['task_type'] != task_type.lower():
            raise ValueError("The task type {} does not match the dataset's task type {}"
                             .format(task_type, experiments_dict[dataset_name]['task_type']))
    except KeyError:
        raise KeyError("Dataset name {} not found in the supported datasets".format(dataset_name))
    data_file_name = os.path.join(download_data_dir, dataset_name+".arff")
    if verbose:
        print(data_file_name)
    if not os.path.exists(data_file_name):
        #TODO: Download the data
        if not os.path.exists(download_data_dir):
            os.makedirs(download_data_dir)
            if verbose:
                print('created directory {}'.format(download_data_dir))
        urllib.request.urlretrieve(experiments_dict[dataset_name]['download_arff_url'], data_file_name)

    assert os.path.exists(data_file_name)
    with open(data_file_name) as f:
        dataDictionary = arff.load(f)  # the with-block closes the file automatically

    from lale.datasets.data_schemas import liac_arff_to_schema
    schema_orig = liac_arff_to_schema(dataDictionary)
    target_col = experiments_dict[dataset_name]['target']
    if preprocess:
        arffData = pd.DataFrame(dataDictionary['data'])
        #arffData = arffData.fillna(0)
        attributes = dataDictionary['attributes']

        if verbose:
            print(attributes)
        categorical_cols = []
        numeric_cols = []
        X_columns = []
        for i, item in enumerate(attributes):
            if item[0].lower() == target_col:
                target_indx = i
                # remove it from attributes so that the next loop indices are adjusted accordingly.
                del attributes[i]
                y = arffData.iloc[:, target_indx]
                arffData = arffData.drop(i, axis=1)

        for i, item in enumerate(attributes):
            X_columns.append(i)
            if (((isinstance(item[1], str) and item[1].lower() not in numeric_data_types_list) \
                or isinstance(item[1], list)) and (item[0].lower() != 'class')):
                categorical_cols.append(i)
            elif (isinstance(item[1], str) and item[1].lower() in numeric_data_types_list) and (item[0].lower() != 'class'):
                numeric_cols.append(i)
        if verbose:
            print(f'categorical columns: {categorical_cols}')
            print(f'numeric columns:     {numeric_cols}')
        X = arffData.iloc[:,X_columns]

        # Count the distinct target values (a quick sanity check on the labels)
        num_classes_from_last_row = len(list(set(y)))

        if verbose:
            print('num_classes_from_last_row', num_classes_from_last_row)

        transformers1 = [
            ( 'imputer_str',
              SimpleImputer(missing_values=None, strategy='most_frequent'),
              categorical_cols),
            ( 'imputer_num',
              SimpleImputer(strategy='mean'), numeric_cols)]
        txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0)

        transformers2 = [
            ( 'ohe', OneHotEncoder(sparse=False),
              list(range(len(categorical_cols)))),
            ( 'no_op', 'passthrough',
              list(range(len(categorical_cols),
                         len(categorical_cols) + len(numeric_cols))))]
        txm2 = ColumnTransformer(transformers2, sparse_threshold=0.0)
        if verbose:
            print("Shape of X before preprocessing", X.shape)
        from lale.operators import make_pipeline
        preprocessing = make_pipeline(txm1, txm2)

        X = preprocessing.fit(X).transform(X)
        if verbose:
            print("Shape of X after preprocessing", X.shape)

    else:
        col_names = [attr[0] for attr in dataDictionary['attributes']]
        df_all = pd.DataFrame(dataDictionary['data'], columns=col_names)
        y = df_all[target_col]
        y = y.squeeze()
        cols_X = [col for col in col_names if col != target_col]
        X = df_all[cols_X]

    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.33, random_state=0)
    if verbose:
        print(f'training set shapes: X {X_train.shape}, y {y_train.shape}')
        print(f'test set shapes:     X {X_test.shape}, y {y_test.shape}')
    X_train, X_test, y_train, y_test = add_schemas( \
        schema_orig, target_col, X_train, X_test, y_train, y_test)
    return (X_train, y_train), (X_test, y_test)
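
A hypothetical call, just to illustrate the return shape; the first argument must be one of the keys of the module's experiments_dict, which is not listed here, so "credit-g" is only a stand-in:

(X_train, y_train), (X_test, y_test) = fetch("credit-g", "classification", verbose=True)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)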
Example #11
    def test_simple_imputer(self):
        from lale.lib.sklearn import SimpleImputer

        reg = SimpleImputer(strategy='mean', fill_value=10)
        reg.fit(self.X_train, self.y_train)
Example #12
    def test_simple_imputer(self):
        reg = SimpleImputer(strategy="mean", fill_value=10)
        reg.fit(self.X_train, self.y_train)
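
One note on the last two examples: in scikit-learn's SimpleImputer, which the lale wrapper mirrors, fill_value is only used when strategy="constant"; with strategy="mean" it has no effect. A minimal sketch of the constant-strategy variant, assuming the lale wrapper and some made-up data:

import numpy as np

from lale.lib.sklearn import SimpleImputer

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan]])
imputer = SimpleImputer(strategy="constant", fill_value=10)  # fill_value takes effect here
X_filled = imputer.fit(X).transform(X)  # missing entries become 10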