def get_data(data_id, randomstate=42):
    """Fetch an OpenML dataset and run metafeature calculation on its training split.

    The dataset is retrieved in array format using its default target
    attribute, split with stratification on the labels so that 60% of the
    samples form the training partition, and that partition is passed to
    ``calculate_all_metafeatures_with_labels`` under the dataset name
    ``'data'``.

    :param data_id: dataset id handed to ``openml.datasets.get_dataset``.
    :param randomstate: value forwarded to ``train_test_split`` as
        ``random_state``.
    :return: ``None``.
    """
    oml_dataset = openml.datasets.get_dataset(dataset_id=data_id)
    features, labels, is_categorical, _attribute_names = oml_dataset.get_data(
        dataset_format="array",
        target=oml_dataset.default_target_attribute,
    )

    train_features, _test_features, train_labels, _test_labels = (
        sklearn.model_selection.train_test_split(
            features,
            labels,
            random_state=randomstate,
            stratify=labels,
            train_size=0.6,
        )
    )

    # NOTE(review): the metafeature result is not captured or returned here;
    # confirm whether callers expect a value from this function.
    calculate_all_metafeatures_with_labels(
        train_features,
        train_labels,
        categorical=is_categorical,
        dataset_name='data',
    )
def _calculate_metafeatures(data_feat_type, data_info_task, basename,
                            x_train, y_train, watcher, logger):
    """Compute the metafeatures of a training set while timing the work.

    :param data_feat_type: iterable of feature-type strings; a feature counts
        as categorical when its lower-cased type equals ``'categorical'``.
    :param data_info_task: task identifier; selects the exclusion list and
        decides whether metafeatures are calculated at all.
    :param basename: dataset name, used for logging and passed on as
        ``dataset_name``.
    :param x_train: training features.
    :param y_train: training labels.
    :param watcher: timer object providing ``start_task``, ``stop_task`` and
        ``wall_elapsed``.
    :param logger: logger used for progress messages.
    :return: the metafeature result with every entry whose ``type_`` is not
        ``'METAFEATURE'`` removed, or ``None`` for an unsupported task.
    """
    # == Calculate metafeatures
    timer_key = 'CalculateMetafeatures'
    watcher.start_task(timer_key)

    is_categorical = [kind.lower() == 'categorical' for kind in data_feat_type]

    if data_info_task in CLASSIFICATION_TASKS:
        excluded = EXCLUDE_META_FEATURES_CLASSIFICATION
    else:
        excluded = EXCLUDE_META_FEATURES_REGRESSION

    supported_tasks = [
        MULTICLASS_CLASSIFICATION,
        BINARY_CLASSIFICATION,
        MULTILABEL_CLASSIFICATION,
        REGRESSION,
    ]
    if data_info_task not in supported_tasks:
        result = None
        logger.info('Metafeatures not calculated')
    else:
        logger.info('Start calculating metafeatures for %s', basename)
        result = calculate_all_metafeatures_with_labels(
            x_train,
            y_train,
            categorical=is_categorical,
            dataset_name=basename,
            dont_calculate=excluded,
        )
        # Collect the names first so the dict is not mutated while iterating.
        non_metafeatures = [
            name
            for name, entry in result.metafeature_values.items()
            if entry.type_ != 'METAFEATURE'
        ]
        for name in non_metafeatures:
            del result.metafeature_values[name]

    watcher.stop_task(timer_key)
    logger.info(
        'Calculating Metafeatures (categorical attributes) took %5.2f',
        watcher.wall_elapsed(timer_key))
    return result
def _calculate_metafeatures__(data_feat_type, data_info_task, basename,
                              x_train, y_train):
    """Compute the metafeatures of a training set.

    :param data_feat_type: iterable of feature-type strings; a feature counts
        as categorical when its lower-cased type equals ``'categorical'``.
    :param data_info_task: task identifier; selects the exclusion list and
        decides whether metafeatures are calculated at all.
    :param basename: dataset name passed on as ``dataset_name``.
    :param x_train: training features.
    :param y_train: training labels.
    :return: the metafeature result with every entry whose ``type_`` is not
        ``'METAFEATURE'`` removed, or ``None`` when ``data_info_task`` is not
        one of the supported classification/regression tasks.
    """
    # == Calculate metafeatures
    categorical = [
        feat_type.lower() == 'categorical' for feat_type in data_feat_type
    ]

    # The exclusion list is resolved before the task check, as before.
    if data_info_task in CLASSIFICATION_TASKS:
        exclude_meta_features = EXCLUDE_META_FEATURES_CLASSIFICATION
    else:
        exclude_meta_features = EXCLUDE_META_FEATURES_REGRESSION

    if data_info_task not in [
        MULTICLASS_CLASSIFICATION,
        BINARY_CLASSIFICATION,
        MULTILABEL_CLASSIFICATION,
        REGRESSION,
    ]:
        return None

    result = calculate_all_metafeatures_with_labels(
        x_train,
        y_train,
        categorical=categorical,
        dataset_name=basename,
        dont_calculate=exclude_meta_features,
    )

    # Drop every entry that is not a real metafeature. Iterate over a
    # snapshot of the keys because entries are deleted inside the loop.
    for key in list(result.metafeature_values):
        if result.metafeature_values[key].type_ != 'METAFEATURE':
            del result.metafeature_values[key]
    return result
def data2features(X_train, y_train, categorical_indicator):
    """Turn a training set into a fixed-order row vector of metafeature values.

    Metafeatures are computed with ``calculate_all_metafeatures_with_labels``
    and then read out in the order given by the module-level
    ``metafeature_names_new``.

    :param X_train: training features.
    :param y_train: training labels.
    :param categorical_indicator: passed on as ``categorical``.
    :return: array of shape ``(1, len(metafeature_names_new))``; positions
        whose metafeature could not be read keep their initial value ``0``.
    """
    metafeatures = calculate_all_metafeatures_with_labels(
        X_train,
        y_train,
        categorical=categorical_indicator,
        dataset_name='data',
    )

    metafeature_values = np.zeros((1, len(metafeature_names_new)))
    for m_i, name in enumerate(metafeature_names_new):
        try:
            metafeature_values[0, m_i] = metafeatures[name].value
        except Exception:
            # Best effort: a metafeature that cannot be read or stored is
            # left at 0. Catching Exception rather than using a bare except
            # lets KeyboardInterrupt and SystemExit propagate.
            continue
    return metafeature_values
def calc_meta_features(X_train, Y_train, categorical, dataset_name):
    """
    Calculate meta features with labels.

    Thin wrapper around ``calculate_all_metafeatures_with_labels``: the
    module-level ``SENTINEL`` is appended to the dataset name and
    ``EXCLUDE_META_FUTURES`` is passed as ``dont_calculate``.

    :param X_train: training features, forwarded unchanged
    :param Y_train: training labels, forwarded unchanged
    :param categorical: forwarded unchanged as the third positional argument
    :param dataset_name: dataset name to which ``SENTINEL`` is appended
    :return: the value returned by ``calculate_all_metafeatures_with_labels``
    """
    tagged_name = dataset_name + SENTINEL
    meta_features = calculate_all_metafeatures_with_labels(
        X_train,
        Y_train,
        categorical,
        tagged_name,
        dont_calculate=EXCLUDE_META_FUTURES,
    )
    return meta_features
def calculate_metafeatures(task_id):
    """Compute label-based and encoded-label metafeatures for one task.

    Steps:

    1. Load the task with ``load_task`` and compute
       ``calculate_all_metafeatures_with_labels`` on the raw training data,
       with every column flagged as non-categorical.
    2. One-hot encode the training data with ``perform_one_hot_encoding``
       and compute ``calculate_all_metafeatures_encoded_labels`` on it under
       a 3072 MB memory limit enforced by ``pynisher``.
    3. If the limited call ended with ``pynisher.MemorylimitException``,
       substitute a placeholder entry with a NaN value for every name in
       ``metafeatures.npy_metafeatures``.
    4. Merge the encoded-label values into the label-based result.

    :param task_id: identifier passed to ``load_task``; also used as the
        dataset name of the resulting metafeature objects.
    :return: the label-based metafeature object, with its
        ``metafeature_values`` updated by the encoded-label values.
    """
    print(task_id)
    # The test split returned by load_task is not needed here.
    X_train, y_train, _X_test, _y_test, cat = load_task(task_id)
    categorical = [c == 'categorical' for c in cat]

    _metafeatures_labels = metafeatures.calculate_all_metafeatures_with_labels(
        X_train, y_train, [False] * X_train.shape[1], task_id)

    X_train, _sparse = perform_one_hot_encoding(
        scipy.sparse.issparse(X_train), categorical, [X_train])
    X_train = X_train[0]
    # After encoding, no column is flagged as categorical any more.
    categorical = [False] * X_train.shape[1]

    start_time = time.time()
    obj = pynisher.enforce_limits(mem_in_mb=3072)(
        metafeatures.calculate_all_metafeatures_encoded_labels)
    _metafeatures_encoded_labels = obj(X_train, y_train, categorical, task_id)
    end_time = time.time()

    # NOTE(review): only the memory-limit exit status is handled; confirm
    # what obj returns for other failure statuses.
    if obj.exit_status == pynisher.MemorylimitException:
        # During the conversion of the dataset (rescaling, etc...), it can
        # happen that we run out of memory.
        _metafeatures_encoded_labels = \
            metafeature.DatasetMetafeatures(task_id, dict())

        # Spread the elapsed wall time evenly over the placeholder entries.
        metafeature_calculation_time = (end_time - start_time) / \
            len(metafeatures.npy_metafeatures)

        for metafeature_name in metafeatures.npy_metafeatures:
            if metafeature_name in metafeatures.metafeatures.functions:
                type_ = "METAFEATURE"
            else:
                type_ = "HELPERFUNCTION"
            # np.nan rather than np.NaN: the capitalised alias was removed
            # in NumPy 2.0.
            _metafeatures_encoded_labels.metafeature_values[metafeature_name] = \
                metafeature.MetaFeatureValue(
                    metafeature_name, type_, 0, 0, np.nan,
                    metafeature_calculation_time,
                    "Memory error during dataset scaling.")

    mf = _metafeatures_labels
    mf.metafeature_values.update(
        _metafeatures_encoded_labels.metafeature_values)
    return mf
def calculate_metafeatures(profile, basename, x_train, y_train):
    """Compute metafeatures for a training set described by a data profile.

    The categorical indicator is read from the ``is_cat`` column of the raw
    profile, restricted to rows that are neither dropped, nor the target,
    nor of column type ``'datetime'``.

    :param profile: object providing ``has_categorical_target()`` and
        ``get_raw_profile()``.
    :param basename: dataset name passed on as ``dataset_name``.
    :param x_train: training features.
    :param y_train: training labels.
    :return: the metafeature result with every entry whose ``type_`` is not
        ``'METAFEATURE'`` removed.
    """
    is_class = profile.has_categorical_target()
    raw_profile = profile.get_raw_profile()

    not_dropped = ~raw_profile['drop']
    not_target = ~raw_profile['target']
    not_datetime = ~(raw_profile['col_type'] == 'datetime')
    categorical = raw_profile.loc[
        not_dropped & not_target & not_datetime, 'is_cat'
    ].values

    excluded = (
        EXCLUDE_META_FEATURES_CLASSIFICATION
        if is_class
        else EXCLUDE_META_FEATURES_REGRESSION
    )

    logger.info('Start calculating metafeatures')
    result = calculate_all_metafeatures_with_labels(
        x_train,
        y_train,
        categorical=categorical,
        dataset_name=basename,
        dont_calculate=excluded,
    )

    # Collect the names first so the dict is not mutated while iterating.
    non_metafeatures = [
        name
        for name, entry in result.metafeature_values.items()
        if entry.type_ != 'METAFEATURE'
    ]
    for name in non_metafeatures:
        del result.metafeature_values[name]
    return result
def calculate_metafeatures(self, data_manager, dataset_name):
    """
    Calculate the dataset's metafeatures.

    Internally calls auto-sklearn's
    ``calculate_all_metafeatures_with_labels()`` on the training data held
    by ``data_manager`` and returns the resulting object with every entry
    whose ``type_`` is not ``'METAFEATURE'`` removed. Returns ``None`` when
    the task in ``data_manager.info`` is not one of the supported
    classification/regression tasks.
    """
    is_categorical = [
        kind.lower() == 'categorical' for kind in data_manager.feat_type
    ]

    if data_manager.info['task'] in ask_const.CLASSIFICATION_TASKS:
        excluded = EXCLUDE_META_FEATURES_CLASSIFICATION
    else:
        excluded = EXCLUDE_META_FEATURES_REGRESSION

    supported_tasks = [
        ask_const.MULTICLASS_CLASSIFICATION,
        ask_const.BINARY_CLASSIFICATION,
        ask_const.MULTILABEL_CLASSIFICATION,
        ask_const.REGRESSION,
    ]
    if data_manager.info['task'] not in supported_tasks:
        return None

    result = calculate_all_metafeatures_with_labels(
        data_manager.data['X_train'],
        data_manager.data['Y_train'],
        categorical=is_categorical,
        dataset_name=dataset_name,
        dont_calculate=excluded,
    )

    # Collect the names first so the dict is not mutated while iterating.
    non_metafeatures = [
        name
        for name, entry in result.metafeature_values.items()
        if entry.type_ != 'METAFEATURE'
    ]
    for name in non_metafeatures:
        del result.metafeature_values[name]
    return result
# NOTE(review): meta_base, metric, task, dataset_name and get_dataset are not
# defined in this fragment; they are expected to exist earlier in the file.

# Call suggest_via_metalearning and print the result, its type and its length.
res = suggest_via_metalearning(meta_base,'198_a_metric',metric,task,False,1)
print(res)
print(type(res))
print(len(res))

from autosklearn.metalearning.metafeatures.metafeatures import \
    calculate_all_metafeatures_with_labels, \
    calculate_all_metafeatures_encoded_labels, subsets

# Load the dataset and print the training labels.
X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
print(Y_train)

# Flag every column as non-categorical.
categorical = [False] * X_train.shape[1]

# Compute and print both metafeature variants on the same inputs.
meta_features_label = calculate_all_metafeatures_with_labels(
    X_train, Y_train, categorical, dataset_name)
print(meta_features_label)

meta_features_encoded_label = calculate_all_metafeatures_encoded_labels(
    X_train, Y_train, categorical, dataset_name)
print(meta_features_encoded_label)

# Disabled code, kept as found:
#configuration_space = get_configuration_space(
#    {
#     'metric': metric,
#     'task': task,
#     'is_sparse': False
#    },
#include_preprocessors=['no_preprocessing'])
#X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
#categorical = [False] * X_train.shape[1]