Example #1
0
def fetch(dataset_name,
          task_type,
          verbose=False,
          preprocess=True,
          test_size=0.33,
          astype=None):
    if verbose:
        print("Loading dataset:", dataset_name)
    # Check that the dataset name exists in experiments_dict
    try:
        if experiments_dict[dataset_name]["task_type"] != task_type.lower():
            raise ValueError(
                "The task type {} does not match the given dataset's task type {}"
                .format(task_type,
                        experiments_dict[dataset_name]["task_type"]))

    except KeyError:
        raise KeyError(
            "Dataset name {} not found in the supported datasets".format(
                dataset_name))
    data_file_name = os.path.join(download_data_dir, dataset_name + ".arff")
    if verbose:
        print(f"data file name: {data_file_name}")
    if not os.path.exists(data_file_name):
        # Download the data file since it is not already cached locally
        if not os.path.exists(download_data_dir):
            os.makedirs(download_data_dir)
            if verbose:
                print("created directory {}".format(download_data_dir))
        urllib.request.urlretrieve(
            experiments_dict[dataset_name]["download_arff_url"],
            data_file_name)

    assert os.path.exists(data_file_name)
    with open(data_file_name) as f:
        dataDictionary = arff.load(f)

    from lale.datasets.data_schemas import liac_arff_to_schema

    schema_orig = liac_arff_to_schema(dataDictionary)
    target_col = experiments_dict[dataset_name]["target"]
    y: Optional[Any] = None
    if preprocess:
        arffData = pd.DataFrame(dataDictionary["data"])
        # arffData = arffData.fillna(0)
        attributes = dataDictionary["attributes"]

        if verbose:
            print(f"attributes: {attributes}")
        categorical_cols = []
        numeric_cols = []
        X_columns = []
        for i, item in enumerate(attributes):
            if item[0].lower() == target_col:
                target_indx = i
                # remove it from attributes so that the next loop indices are adjusted accordingly.
                del attributes[i]
                # the type stubs for pandas are not currently complete enough to type this correctly
                y = arffData.iloc[:, target_indx]  # type: ignore
                arffData = arffData.drop(i, axis=1)

        for i, item in enumerate(attributes):
            X_columns.append(i)
            if ((isinstance(item[1], str) and
                 item[1].lower() not in numeric_data_types_list) or isinstance(
                     item[1], list)) and (item[0].lower() != "class"):
                categorical_cols.append(i)
            elif (isinstance(item[1], str) and item[1].lower()
                  in numeric_data_types_list) and (item[0].lower() != "class"):
                numeric_cols.append(i)
        if verbose:
            print(f"categorical columns: {categorical_cols}")
            print(f"numeric columns:     {numeric_cols}")
        X = arffData.iloc[:, X_columns]

        # Sanity check: number of distinct target values
        num_classes_from_last_row = len(list(set(y))) if y is not None else 0

        if verbose:
            print("num_classes_from_last_row", num_classes_from_last_row)

        transformers1 = [
            (
                "imputer_str",
                SimpleImputer(missing_values=None, strategy="most_frequent"),
                categorical_cols,
            ),
            ("imputer_num", SimpleImputer(strategy="mean"), numeric_cols),
        ]
        txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0)

        transformers2 = [
            ("ohe", OneHotEncoder(sparse=False),
             list(range(len(categorical_cols)))),
            (
                "no_op",
                "passthrough",
                list(
                    range(len(categorical_cols),
                          len(categorical_cols) + len(numeric_cols))),
            ),
        ]
        txm2 = ColumnTransformer(transformers2, sparse_threshold=0.0)
        if verbose:
            print("Shape of X before preprocessing", X.shape)
        from sklearn.pipeline import make_pipeline

        preprocessing = make_pipeline(txm1, txm2)

        X = preprocessing.fit(X).transform(X)

        if verbose:
            print(f"shape of X after preprocessing: {X.shape}")

        if astype == "pandas":
            cat_col_names = [
                attributes[i][0].lower() for i in categorical_cols
            ]
            one_hot_encoder = preprocessing.steps[1][1].named_transformers_[
                "ohe"]
            encoded_names = one_hot_encoder.get_feature_names(cat_col_names)
            num_col_names = [attributes[i][0].lower() for i in numeric_cols]
            col_names = list(encoded_names) + list(num_col_names)
            if verbose:
                print(f"column names after preprocessing: {col_names}")
            X = pd.DataFrame(X, columns=col_names)

    else:
        col_names = [attr[0].lower() for attr in dataDictionary["attributes"]]
        df_all = pd.DataFrame(dataDictionary["data"], columns=col_names)
        y = df_all[target_col]
        # the type stubs for pandas are not currently complete enough to type this correctly
        y = y.squeeze()  # type: ignore
        cols_X = [col for col in col_names if col != target_col]
        X = df_all[cols_X]

    if preprocess:
        labelencoder = LabelEncoder()
        y = labelencoder.fit_transform(y)
    if astype == "pandas" and not isinstance(y, pd.Series):
        y = pd.Series(y, name=target_col)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=0)
    if verbose:
        print(f"training set shapes: X {X_train.shape}, y {y_train.shape}")
        print(f"test set shapes:     X {X_test.shape}, y {y_test.shape}")
    if preprocess:
        from lale.datasets.data_schemas import add_schema

        # np.int was removed from NumPy; the builtin int is equivalent and portable
        X_train = add_schema(X_train.astype(np.number), recalc=True)
        y_train = add_schema(y_train.astype(int), recalc=True)
        X_test = add_schema(X_test.astype(np.number), recalc=True)
        y_test = add_schema(y_test.astype(int), recalc=True)
    else:
        X_train, X_test, y_train, y_test = add_schemas(schema_orig, target_col,
                                                       X_train, X_test,
                                                       y_train, y_test)
    return (X_train, y_train), (X_test, y_test)
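
A minimal call sketch for the first variant, assuming the surrounding module defines experiments_dict, download_data_dir, and the other module-level names the function relies on; the dataset key used here is only an illustrative placeholder, not a guaranteed entry of experiments_dict:

# Hypothetical usage; "credit-g" stands in for any key of experiments_dict
# whose registered task_type is "classification".
(X_train, y_train), (X_test, y_test) = fetch(
    "credit-g",
    "classification",
    verbose=True,
    preprocess=True,
    test_size=0.25,
    astype="pandas",  # return X and y as pandas DataFrame/Series
)
print(X_train.shape, y_train.shape)

With astype="pandas" the preprocessed features come back as a DataFrame whose columns are the one-hot-encoded categorical names followed by the numeric names; with the default astype=None they remain a plain NumPy array.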
Example #2
0
def fetch(dataset_name, task_type, verbose=False, preprocess=True):
    if verbose:
        print('Loading dataset:', dataset_name)
    # Check that the dataset name exists in experiments_dict
    try:
        dataset_name_found = experiments_dict[dataset_name]  # raises KeyError for unknown datasets
        if experiments_dict[dataset_name]['task_type'] != task_type.lower():
            raise ValueError("The task type {} does not match the given dataset's task type {}"
                .format(task_type, experiments_dict[dataset_name]['task_type']))

    except KeyError:
        raise KeyError("Dataset name {} not found in the supported datasets".format(dataset_name))
    data_file_name = os.path.join(download_data_dir, dataset_name+".arff")
    if verbose:
        print(data_file_name)
    if not os.path.exists(data_file_name):
        # Download the data file since it is not already cached locally
        if not os.path.exists(download_data_dir):
            os.makedirs(download_data_dir)
            if verbose:
                print('created directory {}'.format(download_data_dir))
        urllib.request.urlretrieve(experiments_dict[dataset_name]['download_arff_url'], data_file_name)

    assert os.path.exists(data_file_name)
    with open(data_file_name) as f:
        dataDictionary = arff.load(f)

    from lale.datasets.data_schemas import liac_arff_to_schema
    schema_orig = liac_arff_to_schema(dataDictionary)
    target_col = experiments_dict[dataset_name]['target']
    if preprocess:
        arffData = pd.DataFrame(dataDictionary['data'])
        #arffData = arffData.fillna(0)
        attributes = dataDictionary['attributes']

        if verbose:
            print(attributes)
        categorical_cols = []
        numeric_cols = []
        X_columns = []
        for i, item in enumerate(attributes):
            if item[0].lower() == target_col:
                target_indx = i
                #remove it from attributes so that the next loop indices are adjusted accordingly.
                del attributes[i]
                y = arffData.iloc[:,target_indx]
                arffData = arffData.drop(i, axis = 1)

        for i, item in enumerate(attributes):
            X_columns.append(i)
            if (((isinstance(item[1], str) and item[1].lower() not in numeric_data_types_list) \
                or isinstance(item[1], list)) and (item[0].lower() != 'class')):
                categorical_cols.append(i)
            elif (isinstance(item[1], str) and item[1].lower() in numeric_data_types_list) and (item[0].lower() != 'class'):
                numeric_cols.append(i)
        if verbose:
            print(f'categorical columns: {categorical_cols}')
            print(f'numeric columns:     {numeric_cols}')
        X = arffData.iloc[:,X_columns]

        # Sanity check: number of distinct target values
        num_classes_from_last_row = len(list(set(y)))

        if verbose:
            print('num_classes_from_last_row', num_classes_from_last_row)

        transformers1 = [
            ( 'imputer_str',
              SimpleImputer(missing_values=None, strategy='most_frequent'),
              categorical_cols),
            ( 'imputer_num',
              SimpleImputer(strategy='mean'), numeric_cols)]
        txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0)

        transformers2 = [
            ( 'ohe', OneHotEncoder(sparse=False),
              list(range(len(categorical_cols)))),
            ( 'no_op', 'passthrough',
              list(range(len(categorical_cols),
                         len(categorical_cols) + len(numeric_cols))))]
        txm2 = ColumnTransformer(transformers2, sparse_threshold=0.0)
        if verbose:
            print("Shape of X before preprocessing", X.shape)
        from lale.operators import make_pipeline
        preprocessing = make_pipeline(txm1, txm2)

        X = preprocessing.fit(X).transform(X)
        if verbose:
            print("Shape of X after preprocessing", X.shape)

    else:
        col_names = [attr[0] for attr in dataDictionary['attributes']]
        df_all = pd.DataFrame(dataDictionary['data'], columns=col_names)
        y = df_all[target_col]
        y = y.squeeze()
        cols_X = [col for col in col_names if col != target_col]
        X = df_all[cols_X]

    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=0)
    if verbose:
        print(f'training set shapes: X {X_train.shape}, y {y_train.shape}')
        print(f'test set shapes:     X {X_test.shape}, y {y_test.shape}')
    X_train, X_test, y_train, y_test = add_schemas(
        schema_orig, target_col, X_train, X_test, y_train, y_test)
    return (X_train, y_train), (X_test, y_test)
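
For comparison, a call sketch for this second, earlier variant, which exposes no test_size or astype knobs, always uses a 33% test split, and always label-encodes y before splitting; the same caveat applies that the dataset key below is only a placeholder:

# Hypothetical usage of the second variant; module-level names such as
# experiments_dict and add_schemas are assumed to be defined as above.
(X_train, y_train), (X_test, y_test) = fetch(
    "credit-g", "classification", verbose=False, preprocess=True)
print(X_train.shape, X_test.shape)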