def add_schemas(schema_orig, target_col, train_X, test_X, train_y, test_y):
    """Attach JSON schemas, derived from the original ARFF schema, to the
    train/test splits, separating the target column from the features."""
    from lale.datasets.data_schemas import add_schema

    elems_X = [
        item_schema
        for item_schema in schema_orig["items"]["items"]
        if item_schema["description"].lower() != target_col
    ]
    elem_y = [
        item_schema
        for item_schema in schema_orig["items"]["items"]
        if item_schema["description"].lower() == target_col
    ][0]
    if "enum" in elem_y:
        # After label encoding, the target values are consecutive integers.
        elem_y["enum"] = [*range(len(elem_y["enum"]))]
    ncols_X = len(elems_X)
    rows_X = {
        **schema_orig["items"],
        "minItems": ncols_X,
        "maxItems": ncols_X,
        "items": elems_X,
    }
    # Register the attribute name so pandas treats the attached schema as
    # metadata rather than as data.
    if "json_schema" not in pd.DataFrame._internal_names:
        pd.DataFrame._internal_names.append("json_schema")
    nrows_train, nrows_test = len(train_y), len(test_y)
    train_X = add_schema(
        train_X,
        {
            **schema_orig,
            "minItems": nrows_train,
            "maxItems": nrows_train,
            "items": rows_X,
        },
    )
    test_X = add_schema(
        test_X,
        {
            **schema_orig,
            "minItems": nrows_test,
            "maxItems": nrows_test,
            "items": rows_X,
        },
    )
    train_y = add_schema(
        train_y,
        {
            **schema_orig,
            "minItems": nrows_train,
            "maxItems": nrows_train,
            "items": elem_y,
        },
    )
    test_y = add_schema(
        test_y,
        {
            **schema_orig,
            "minItems": nrows_test,
            "maxItems": nrows_test,
            "items": elem_y,
        },
    )
    return train_X, test_X, train_y, test_y
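# A minimal, hypothetical sketch (not part of the module's API) of the schema
# shape that add_schemas consumes. It assumes liac_arff_to_schema yields an
# array-of-rows schema whose nested 'items' list holds one sub-schema per
# column, keyed by 'description'; the column names and values below are made
# up for illustration.
def _sketch_add_schemas():
    schema_orig = {
        "type": "array",
        "items": {
            "type": "array",
            "items": [
                {"description": "age", "type": "number"},
                {"description": "class", "enum": ["good", "bad"]},
            ],
        },
    }
    train_X = pd.DataFrame({"age": [25.0, 40.0]})
    test_X = pd.DataFrame({"age": [33.0]})
    train_y = pd.Series([1, 0], name="class")
    test_y = pd.Series([0], name="class")
    # 'class' is the target: its sub-schema becomes the y schema (with the
    # enum remapped to [0, 1]), and the remaining sub-schemas become the X
    # schema, sized to the respective split.
    return add_schemas(schema_orig, "class", train_X, test_X, train_y, test_y)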
def fetch(
    dataset_name, task_type, verbose=False, preprocess=True, test_size=0.33, astype=None
):
    if verbose:
        print("Loading dataset:", dataset_name)
    # Check that the dataset name exists in experiments_dict and that the
    # requested task type matches.
    try:
        if experiments_dict[dataset_name]["task_type"] != task_type.lower():
            raise ValueError(
                "The task type {} does not match the task type {} of the given dataset".format(
                    task_type, experiments_dict[dataset_name]["task_type"]
                )
            )
    except KeyError:
        raise KeyError(
            "Dataset name {} not found in the supported datasets".format(dataset_name)
        )
    data_file_name = os.path.join(download_data_dir, dataset_name + ".arff")
    if verbose:
        print(f"data file name: {data_file_name}")
    if not os.path.exists(data_file_name):
        # Download the ARFF file on first use and cache it locally.
        if not os.path.exists(download_data_dir):
            os.makedirs(download_data_dir)
            if verbose:
                print("created directory {}".format(download_data_dir))
        urllib.request.urlretrieve(
            experiments_dict[dataset_name]["download_arff_url"], data_file_name
        )
    assert os.path.exists(data_file_name)
    with open(data_file_name) as f:
        dataDictionary = arff.load(f)

    from lale.datasets.data_schemas import liac_arff_to_schema

    schema_orig = liac_arff_to_schema(dataDictionary)
    target_col = experiments_dict[dataset_name]["target"]
    y: Optional[Any] = None
    if preprocess:
        arffData = pd.DataFrame(dataDictionary["data"])
        # arffData = arffData.fillna(0)
        attributes = dataDictionary["attributes"]
        if verbose:
            print(f"attributes: {attributes}")
        categorical_cols = []
        numeric_cols = []
        X_columns = []
        for i, item in enumerate(attributes):
            if item[0].lower() == target_col:
                target_indx = i
                # Remove the target from attributes so that the next loop's
                # indices are adjusted accordingly.
                del attributes[i]
                # The type stubs for pandas are not currently complete enough
                # to type this correctly.
                y = arffData.iloc[:, target_indx]  # type: ignore
                arffData = arffData.drop(i, axis=1)
                break
        for i, item in enumerate(attributes):
            X_columns.append(i)
            if (
                (
                    isinstance(item[1], str)
                    and item[1].lower() not in numeric_data_types_list
                )
                or isinstance(item[1], list)
            ) and (item[0].lower() != "class"):
                categorical_cols.append(i)
            elif (
                isinstance(item[1], str) and item[1].lower() in numeric_data_types_list
            ) and (item[0].lower() != "class"):
                numeric_cols.append(i)
        if verbose:
            print(f"categorical columns: {categorical_cols}")
            print(f"numeric columns: {numeric_cols}")
        X = arffData.iloc[:, X_columns]

        # Sanity check on the number of distinct target values.
        num_classes_from_last_row = len(list(set(y))) if y is not None else 0
        if verbose:
            print("num_classes_from_last_row", num_classes_from_last_row)

        # First impute missing values, then one-hot encode the categorical
        # columns and pass the numeric columns through unchanged.
        transformers1 = [
            (
                "imputer_str",
                SimpleImputer(missing_values=None, strategy="most_frequent"),
                categorical_cols,
            ),
            ("imputer_num", SimpleImputer(strategy="mean"), numeric_cols),
        ]
        txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0)
        transformers2 = [
            ("ohe", OneHotEncoder(sparse=False), list(range(len(categorical_cols)))),
            (
                "no_op",
                "passthrough",
                list(
                    range(
                        len(categorical_cols),
                        len(categorical_cols) + len(numeric_cols),
                    )
                ),
            ),
        ]
        txm2 = ColumnTransformer(transformers2, sparse_threshold=0.0)
        if verbose:
            print("Shape of X before preprocessing", X.shape)
        from sklearn.pipeline import make_pipeline

        preprocessing = make_pipeline(txm1, txm2)
        X = preprocessing.fit(X).transform(X)
        if verbose:
            print(f"shape of X after preprocessing: {X.shape}")

        if astype == "pandas":
            # Recover meaningful column names for the one-hot encoded output.
            cat_col_names = [attributes[i][0].lower() for i in categorical_cols]
            one_hot_encoder = preprocessing.steps[1][1].named_transformers_["ohe"]
            encoded_names = one_hot_encoder.get_feature_names(cat_col_names)
            num_col_names = [attributes[i][0].lower() for i in numeric_cols]
            col_names = list(encoded_names) + list(num_col_names)
            if verbose:
                print(f"column names after preprocessing: {col_names}")
            X = pd.DataFrame(X, columns=col_names)
    else:
        col_names = [attr[0].lower() for attr in dataDictionary["attributes"]]
        df_all = pd.DataFrame(dataDictionary["data"], columns=col_names)
        y = df_all[target_col]
        # The type stubs for pandas are not currently complete enough to type
        # this correctly.
        y = y.squeeze()  # type: ignore
        cols_X = [col for col in col_names if col != target_col]
        X = df_all[cols_X]

    if preprocess:
        labelencoder = LabelEncoder()
        y = labelencoder.fit_transform(y)
        if astype == "pandas" and not isinstance(y, pd.Series):
            y = pd.Series(y, name=target_col)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )
    if verbose:
        print(f"training set shapes: X {X_train.shape}, y {y_train.shape}")
        print(f"test set shapes: X {X_test.shape}, y {y_test.shape}")
    if preprocess:
        from lale.datasets.data_schemas import add_schema

        X_train = add_schema(X_train.astype(np.number), recalc=True)
        y_train = add_schema(y_train.astype(int), recalc=True)
        X_test = add_schema(X_test.astype(np.number), recalc=True)
        y_test = add_schema(y_test.astype(int), recalc=True)
    else:
        X_train, X_test, y_train, y_test = add_schemas(
            schema_orig, target_col, X_train, X_test, y_train, y_test
        )
    return (X_train, y_train), (X_test, y_test)
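# A short usage sketch. The dataset name "credit-g" is illustrative and
# assumed to be a key in experiments_dict; substitute any supported dataset.
if __name__ == "__main__":
    # With preprocess=True, the returned X splits are imputed and one-hot
    # encoded arrays with schemas attached; y splits are label-encoded.
    (train_X, train_y), (test_X, test_y) = fetch(
        "credit-g", "classification", verbose=True, preprocess=True
    )
    print(train_X.shape, test_X.shape)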