示例#1
0
def get_data(data_path='/adult/dataset_183_adult.csv', continuous_columns = [0, 2, 4, 10, 11, 12], sensitive_attribute = "sex", limit = 1000):
	"""Load a CSV classification dataset and prepare a fairness experiment.

	The column named 'class' is used as the target; every column not listed
	in ``continuous_columns`` is treated as categorical and one-hot encoded,
	while continuous columns are min-max scaled.

	Parameters
	----------
	data_path : str
		Path of the CSV file, relative to Config.get('data_path').
	continuous_columns : list[int]
		Positional indices of the continuous feature columns.
	sensitive_attribute : str
		Name of the (categorical) sensitive column.
	limit : int
		Only the first ``limit`` rows are used.

	Returns
	-------
	tuple
		(X_train, X_test, y_train, y_test, names, sensitive_ids) where
		``names`` are the transformed feature names (one-hot names first,
		then continuous columns, matching the FeatureUnion output order)
		and ``sensitive_ids`` are the indices of the one-hot columns that
		encode the sensitive attribute.
	"""
	df = pd.read_csv(Config.get('data_path') + data_path, delimiter=',', header=0)
	y = df['class']
	del df['class']
	X = df
	one_hot = True

	X_train, X_test, y_train, y_test = train_test_split(X.values[0:limit, :], y.values[0:limit], test_size=0.5, random_state=42)

	# positional index of the sensitive attribute in the raw frame (-1 if absent)
	sensitive_attribute_id = -1
	for c_i, col in enumerate(df.columns):
		if str(col) == sensitive_attribute:
			sensitive_attribute_id = c_i
			break

	# every column that is not declared continuous is treated as categorical
	categorical_features = list(set(range(X_train.shape[1])) - set(continuous_columns))

	# position of the sensitive attribute within the categorical feature list
	# (this is the index the one-hot encoder uses in its feature names)
	cat_sensitive_attribute_id = -1
	for c_i, cat_id in enumerate(categorical_features):
		if cat_id == sensitive_attribute_id:
			cat_sensitive_attribute_id = c_i
			break

	if one_hot:
		# NOTE(review): 'sparse' was renamed to 'sparse_output' in
		# scikit-learn 1.2 and removed in 1.4; kept as-is for compatibility
		# with the sklearn version this project pins.
		ct = ColumnTransformer([("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)])
		scale = ColumnTransformer([("scale", MinMaxScaler(), continuous_columns)])

		pipeline = FeatureUnion([("o", ct), ("s", scale)])

		X_train = pipeline.fit_transform(X_train)
		X_test = pipeline.transform(X_test)

	# feature names: one-hot names first, then the continuous column names
	names = ct.get_feature_names()
	for c in continuous_columns:
		names.append(str(X.columns[c]))

	print(names)

	# indices of the one-hot columns that encode the sensitive attribute
	# (their generated names start with 'onehot__x<categorical index>_')
	sensitive_ids = []
	all_names = ct.get_feature_names()
	for fname_i in range(len(all_names)):
		if all_names[fname_i].startswith('onehot__x' + str(cat_sensitive_attribute_id) + '_'):
			sensitive_ids.append(fname_i)

	# fit_transform both fits and applies the label encoder; the previous
	# extra le.fit(y_train) call was redundant
	le = preprocessing.LabelEncoder()
	y_train = le.fit_transform(y_train)
	y_test = le.transform(y_test)

	return X_train, X_test, y_train, y_test, names, sensitive_ids
示例#2
0
    def evaluate_candidates(self, candidates):
        """Evaluate candidate features in parallel over 10 stratified folds.

        Stores the fold index pairs in ``self.preprocessed_folds`` (so the
        workers can reuse identical splits) and maps
        ``self.evaluate_single_candidate`` over ``candidates`` using a
        process pool sized by Config.get("parallelism").

        Returns the list of per-candidate evaluation results.
        """
        self.preprocessed_folds = []
        # NOTE: random_state removed — with shuffle=False (the default) it
        # had no effect and recent scikit-learn versions reject it outright.
        for train, test in StratifiedKFold(n_splits=10).split(
                self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))

        # context manager guarantees the worker processes are cleaned up
        # (the original leaked the pool: no close()/join())
        with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
            results = pool.map(self.evaluate_single_candidate, candidates)
        return results
    def read(self):
        """Read an exported openML dataset from disk and populate this object.

        Loads ``<openml.path>/info.csv`` to verify the dataset named
        ``self.name`` is a classification task, reads the JSON column schema
        and the CSV data, extracts the 'target' column, materializes
        train/valid/test splits via ``self.splitter``, and builds one
        RawFeature per remaining attribute.

        Side effects: sets self.dataframe, self.target_column_id,
        self.target_values, self.splitted_values, self.splitted_target and
        appends to self.raw_features.

        Returns
        -------
        list
            ``self.raw_features`` — one RawFeature per non-target column.
        """
        openML_path = Config.get('openml.path')

        info_frame = pd.read_csv(openML_path + "/info.csv")

        # NOTE(review): assert is stripped under `python -O`; a raised
        # ValueError would be more robust — confirm before changing.
        assert info_frame[info_frame['name'] == self.name][
            'MLType'].values == 'classification', "it is not a classification task"

        #get schema and target
        file = open(openML_path + "/data/" + self.name + "_columns.csv",
                    mode='r')
        json_schema = file.read()
        file.close()
        schema = json.loads(json_schema)

        # NOTE(review): `names` is computed but never used below.
        names = [s['name'] for s in schema]

        self.dataframe = pd.read_csv(
            openML_path + "/data/" + self.name + ".csv", )

        # positional index of the column literally named 'target'
        self.target_column_id = np.where(
            self.dataframe.columns == 'target')[0][0]

        # get target
        self.target_values = self.dataframe[self.dataframe.columns[
            self.target_column_id]].values
        self.dataframe.drop(self.dataframe.columns[self.target_column_id],
                            axis=1,
                            inplace=True)

        # get split of the data
        self.splitter.get_splitted_ids(self.dataframe, self.target_values)

        self.splitted_values = {}
        self.splitted_target = {}

        self.splitted_target['train'], self.splitted_target[
            'valid'], self.splitted_target[
                'test'] = self.splitter.materialize_target(self.target_values)
        self.splitted_values['train'], self.splitted_values[
            'valid'], self.splitted_values[
                'test'] = self.splitter.materialize_values(self.dataframe)

        # one RawFeature per remaining attribute, with derived properties
        for attribute_i in range(self.dataframe.shape[1]):
            properties = self.derive_properties(
                attribute_i,
                self.dataframe[self.dataframe.columns[attribute_i]].values)
            self.raw_features.append(
                RawFeature(self.dataframe.columns[attribute_i], attribute_i,
                           properties))

        return self.raw_features
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import multiprocessing as mp
import itertools
from sklearn.ensemble import RandomForestRegressor
import scipy.special
import seaborn as sns
import matplotlib.pyplot as plt
from fastsklearnfeature.configuration.Config import Config


#X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:,0:500] [0:100,:]
#y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values [0:100]

# Load a small ARCENE sample: space-separated, no header; keep the first
# 10000 columns and the first 100 rows to keep the experiment cheap.
X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data', delimiter=' ', header=None).values[:,0:10000][0:100,:]
y_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.labels', delimiter=' ', header=None).values[0:100]
data_name = 'ARCENE_sample'



# generate grid
# complexity axis: one entry per possible number of selected features
complexity_grid = np.arange(1, X_train.shape[1]+1)
max_acc = 1.0
# accuracy axis: 100 evenly spaced thresholds in [0.0, max_acc)
accuracy_grid = np.arange(0.0, max_acc, max_acc / 100.0)

def get_estimated_runtimes(old_model = "/tmp/model11_hyperopt.p"):

	grid = list(itertools.product(complexity_grid, accuracy_grid))
	meta_X_data = np.matrix(grid)
示例#5
0
'''
data = pd.read_csv(Config.get('data_path') + '/breastTumor/breastTumor.csv', delimiter=',', header=0)
y = data['binaryClass'].values
X = data[data.columns.difference(['binaryClass'])].values
data_name = 'breastTumor'
one_hot = True
'''
'''
data = pd.read_csv(Config.get('data_path') + '/promoters/dataset_106_molecular-biology_promoters.csv', delimiter=',', header=0)
y = data['class'].values
X = data[data.columns.difference(['class', 'instance'])].values
data_name = 'promoters'
one_hot = True
'''

# Active dataset: madelon (space-separated, no header); only the first
# 500 columns are kept as features.
X = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data',
                delimiter=' ',
                header=None).values[:, 0:500]
y = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels',
                delimiter=' ',
                header=None).values
data_name = 'madelon'
one_hot = False
'''
X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data', delimiter=' ', header=None).values[:,0:10000]
y_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.labels', delimiter=' ', header=None).values
data_name = 'ARCENE'
one_hot = False
'''

print(X.shape)
示例#6
0
if __name__ == '__main__':
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8)

    #dataset = (Config.get('data_path') + "/phpn1jVwe_mammography.csv", 6)
    #dataset = (Config.get('data_path') + "/dataset_23_cmc_contraceptive.csv", 9)
    #dataset = (Config.get('data_path') + "/dataset_31_credit-g_german_credit.csv", 20)
    #dataset = (Config.get('data_path') + '/dataset_53_heart-statlog_heart.csv', 13)
    #dataset = (Config.get('data_path') + '/ILPD.csv', 10)
    #dataset = (Config.get('data_path') + '/iris.data', 4)
    #dataset = (Config.get('data_path') + '/data_banknote_authentication.txt', 4)
    #dataset = (Config.get('data_path') + '/ecoli.data', 8)
    #dataset = (Config.get('data_path') + '/breast-cancer.data', 0)
    dataset = (Config.get('data_path') + '/transfusion.data', 4)
    #dataset = (Config.get('data_path') + '/test_categorical.data', 4)
    #dataset = ('../configuration/resources/data/transfusion.data', 4)
    #dataset = (Config.get('data_path') + '/wine.data', 0)

    #dataset = (Config.get('data_path') + '/house_price.csv', 79)
    #dataset = (Config.get('data_path') + '/synthetic_data.csv', 3)

    from fastsklearnfeature.reader.OnlineOpenMLReader import OnlineOpenMLReader

    from fastsklearnfeature.feature_selection.evaluation.openMLdict import openMLname2task

    # task_id = openMLname2task['transfusion'] #interesting
    # task_id = openMLname2task['iris']
    # task_id = openMLname2task['ecoli']
    # task_id = openMLname2task['breast cancer']
示例#7
0
#y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values [0:100]
'''
X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data', delimiter=' ', header=None).values[:,0:10000]
y_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.labels', delimiter=' ', header=None).values
data_name = 'ARCENE'
my_path = "/home/felix/phd/feature_constraints/experiments_arcene/"
onehot = False
'''
'''
data = pd.read_csv(Config.get('data_path') + '/musk/musk.csv', delimiter=',', header=0)
y_train = data['class']
X_train = data[data.columns.difference(['class', 'ID', 'molecule_name', 'conformation_name'])].values
data_name = 'musk'
'''

# Active dataset: breastTumor. 'binaryClass' is the target; all other
# columns become features (columns.difference returns them sorted by name).
data = pd.read_csv(Config.get('data_path') + '/breastTumor/breastTumor.csv',
                   delimiter=',',
                   header=0)
y_train = data['binaryClass'].values
X_train = data[data.columns.difference(['binaryClass'])].values
data_name = 'breastTumor'
my_path = "/home/felix/phd/feature_constraints/experiments_actual_tumor/"
onehot = True
'''
data = pd.read_csv(Config.get('data_path') + '/promoters/dataset_106_molecular-biology_promoters.csv', delimiter=',', header=0)
y_train = data['class'].values
X_train = data[data.columns.difference(['class', 'instance'])].values
data_name = 'promoters'
my_path = "/home/felix/phd/feature_constraints/experiments_promoters/"
onehot = True
'''
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        print([r.name for r in self.dataset.raw_features])


        plain_attributes = CandidateFeature(IdentityTransformation(len(self.dataset.raw_features)), self.dataset.raw_features)


        self.evaluate_candidates([plain_attributes])




#statlog_heart.csv=/home/felix/datasets/ExploreKit/csv/dataset_53_heart-statlog_heart.csv
#statlog_heart.target=13

if __name__ == '__main__':
    # Dataset spec: (path to the statlog heart CSV, index of the target column).
    heart_dataset = (Config.get('statlog_heart.csv'), 13)

    # Build the experiment for this dataset and run it.
    experiment = SissoExperiment(heart_dataset)
    experiment.run()






示例#9
0
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2

from sklearn.model_selection import cross_val_score
from fastsklearnfeature.interactiveAutoML.fair_measure import true_positive_rate_score
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.robust_measure import robust_score
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.robust_measure import robust_score_test
import fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.my_global_variable as my_global_variable



# Fairness experiment setup: protected attribute and model size.
sensitive_attribute = "sex"

n_estimators = 5

# Load the Adult census dataset; 'class' is the binary target column.
df = pd.read_csv(Config.get('data_path') + '/adult/dataset_183_adult.csv', delimiter=',', header=0)
y = df['class']
del df['class']
X = df
one_hot = True

# cap the sample size to keep the experiment fast
limit = 1000

X_train, X_test, y_train, y_test = train_test_split(X.values[0:limit,:], y.values[0:limit], test_size=0.5, random_state=42)

from sklearn import preprocessing
# NOTE(review): le.fit followed by le.fit_transform is redundant —
# fit_transform refits the encoder anyway; behavior is unchanged.
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
示例#10
0
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler
import fastsklearnfeature.interactiveAutoML.feature_selection.WrapperBestK as wrap
from sklearn.ensemble import ExtraTreesClassifier
from hyperopt.fmin import generate_trials_to_calculate
'''
data = pd.read_csv(Config.get('data_path') + '/breastTumor/breastTumor.csv', delimiter=',', header=0)
y = data['binaryClass'].values
X = data[data.columns.difference(['binaryClass'])].values
data_name = 'breastTumor'
one_hot = True
'''

# Active dataset: promoters. 'class' is the target and 'instance' is a
# row identifier — both are excluded from the feature matrix.
data = pd.read_csv(Config.get('data_path') +
                   '/promoters/dataset_106_molecular-biology_promoters.csv',
                   delimiter=',',
                   header=0)
y = data['class'].values
X = data[data.columns.difference(['class', 'instance'])].values
data_name = 'promoters'
one_hot = True
'''
X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:,0:500]
y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values
data_name = 'madelon'
one_hot = False
'''
'''
X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data', delimiter=' ', header=None).values[:,0:10000]
示例#11
0
number_instances = []
number_attributes = []
number_features = []


def get_class_attribute_name(df):
	"""Return the name of the first column whose name starts with 'class@'.

	Parameters
	----------
	df : pandas.DataFrame
		Frame whose columns follow the ARFF 'name@domain' convention.

	Returns
	-------
	str or None
		The matching column name, or None when no column matches
		(the original fell off the end and returned None implicitly).
	"""
	for col in df.columns:
		if str(col).startswith('class@'):
			return str(col)
	return None

def get_sensitive_attribute_id(df, sensitive_attribute_name):
	"""Return the positional index of the named column in *df*.

	Parameters
	----------
	df : pandas.DataFrame
		Frame to search.
	sensitive_attribute_name : str
		Exact column name to look for.

	Returns
	-------
	int or None
		Zero-based column index, or None when the column is absent
		(the original fell off the end and returned None implicitly).
	"""
	for i, col in enumerate(df.columns):
		if str(col) == sensitive_attribute_name:
			return i
	return None

with open(Config.get('data_path') + "/downloaded_arff/" + "42132.arff") as f:
	df = a2p.load(f)
	print(df.columns)

	print(df.head())


	#df['TotalCharges@REAL'] = pd.to_numeric(df['TotalCharges@STRING'], errors='coerce')
	df = df.drop(columns=['geolocation@STRING'])
	df = df.drop(columns=['seqid@STRING'])
	df = df.drop(columns=['date_of_stop@STRING'])
	df = df.drop(columns=['time_of_stop@STRING'])
	df = df.drop(columns=['description@STRING'])
	df = df.drop(columns=['location@STRING'])

	df.rename(columns={'race@{ASIAN,BLACK,HISPANIC,NATIVE AMERICAN,OTHER,WHITE}': 'race@{BLACKLIVESMATTER,OTHER}'}, inplace=True)
示例#12
0
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler
import fastsklearnfeature.interactiveAutoML.feature_selection.WrapperBestK as wrap
from sklearn.ensemble import ExtraTreesClassifier

'''
data = pd.read_csv(Config.get('data_path') + '/breastTumor/breastTumor.csv', delimiter=',', header=0)
y = data['binaryClass'].values
X = data[data.columns.difference(['binaryClass'])].values
data_name = 'breastTumor'
one_hot = True
'''


# Active dataset: promoters. 'class' is the target, 'instance' a row id;
# neither is included in the feature matrix.
data = pd.read_csv(Config.get('data_path') + '/promoters/dataset_106_molecular-biology_promoters.csv', delimiter=',', header=0)
y = data['class'].values
X = data[data.columns.difference(['class', 'instance'])].values
data_name = 'promoters'
one_hot = True




'''
X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:,0:500]
y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values
data_name = 'madelon'
one_hot = False
'''
示例#13
0
    def run(self):
        """Greedy forward feature construction (ExploreKit-style search).

        Each iteration ranks all candidate features by information gain,
        then evaluates the best-ranked candidates combined with the current
        base feature set until one improves the score by at least
        ``epsilon_w`` (or the budget ``R_w`` / score threshold is hit).
        Intermediate results are checkpointed to
        ``<tmp.folder>/explorekit_results.p``.

        Returns
        -------
        dict or CandidateFeature
            Per-iteration results, or the final base feature set when no
            candidate improves the score beyond ``threshold_w``.
        """
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()

        # mark every raw attribute as numeric for downstream transformers
        for raw_f in self.raw_features:
            raw_f.properties['type'] = 'float'

        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        # deep-copy so repeated evaluations always see identical folds
        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        # search budget and thresholds
        R_w = 15000            # max candidate evaluations per iteration
        max_iterations = 15  #15
        threshold_f = 0.001    # min information gain to consider a candidate
        epsilon_w = 0.01       # improvement that immediately ends an iteration
        threshold_w = 0.0      # min improvement to keep searching

        all_features = self.produce_features()

        print(len(all_features))

        self.base_features = CandidateFeature(
            IdentityTransformation(len(self.raw_features)), self.raw_features)

        results = {}

        for i in range(max_iterations):

            print("base features: " + str(self.base_features))

            results[i] = self.evaluate_candidates([self.base_features],
                                                  myfolds)[0]
            print(results[i])
            print(results[i].runtime_properties)

            # rank candidates by information gain, best first
            feature_scores = self.evaluate_ranking(all_features)
            ids = np.argsort(np.array(feature_scores) * -1)
            print(feature_scores)

            # np.NINF was removed in NumPy 2.0; float('-inf') is equivalent
            best_improvement_so_far = float('-inf')
            best_Feature_So_Far = None
            evaluated_candidate_features = 0
            for f_i in range(len(feature_scores)):
                # remaining candidates are below the info-gain threshold
                if feature_scores[ids[f_i]] < threshold_f:
                    break

                current_feature_set = CandidateFeature(
                    IdentityTransformation(2),
                    [self.base_features, all_features[ids[f_i]]])
                print(current_feature_set)
                result = self.evaluate_candidates([current_feature_set],
                                                  myfolds)[0]
                evaluated_candidate_features += 1
                improvement = result.runtime_properties['score'] - results[
                    i].runtime_properties['score']

                print("Candidate: " + str(all_features[ids[f_i]]) +
                      " score: " + str(result.runtime_properties['score']) +
                      " info: " + str(feature_scores[ids[f_i]]))
                print("improvement: " + str(improvement))
                if improvement > best_improvement_so_far:
                    best_improvement_so_far = improvement
                    best_Feature_So_Far = result

                    results[i] = best_Feature_So_Far
                    results[i].runtime_properties[
                        'score_improvement'] = improvement
                    results[i].runtime_properties[
                        'info_gain'] = feature_scores[ids[f_i]]
                    results[i].runtime_properties['global time'] = time.time(
                    ) - self.global_starting_time

                    # checkpoint intermediate results to disk
                    pickle.dump(
                        results,
                        open(
                            Config.get("tmp.folder") + "/explorekit_results.p",
                            "wb"))

                if improvement >= epsilon_w:
                    break
                if evaluated_candidate_features >= R_w:
                    break

            if best_improvement_so_far > threshold_w:
                self.base_features = best_Feature_So_Far
            else:
                return self.base_features

            # drop negatively scored candidates before the next iteration.
            # (bug fix: the original reused 'i' here, shadowing the outer
            # iteration counter within the same pass)
            all_features_new = []
            for f_j in range(len(feature_scores)):
                if feature_scores[f_j] >= 0:
                    all_features_new.append(all_features[f_j])
            all_features = all_features_new
        return results
示例#14
0
 def evaluate_ranking(self, candidates):
     """Score each candidate feature in parallel via its information gain.

     Resets ``self.preprocessed_folds`` and maps
     ``self.get_info_gain_of_feature`` over ``candidates`` with a process
     pool sized by Config.get("parallelism").

     Returns the list of per-candidate scores in input order.
     """
     self.preprocessed_folds = []
     # context manager guarantees the worker processes are cleaned up
     # (the original leaked the pool: no close()/join())
     with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
         results = pool.map(self.get_info_gain_of_feature, candidates)
     return results
示例#15
0
import openml
from fastsklearnfeature.configuration.Config import Config
import pickle
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from fastsklearnfeature.configuration.Config import Config
from sklearn import preprocessing
import openml
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split

openml.config.apikey = Config.get('openML.apikey')

unique_data = {}

for _, data_info in openml.datasets.list_datasets().items():
    if 'status' in data_info and data_info['status'] == 'active' \
      and 'NumberOfClasses' in data_info and data_info['NumberOfClasses'] == 2 \
      and 'NumberOfInstances' in data_info and data_info['NumberOfInstances'] > 250:

        try:

            dataset = openml.datasets.get_dataset(data_info['did'])
            print(data_info)

            continuous_columns = []
            categorical_features = []
示例#16
0
def get_fair_data1(dataset_key=None):
	"""Load one of a fixed set of openML fairness datasets from local ARFF.

	Parameters
	----------
	dataset_key : str, optional
		openML dataset id (a key of ``map_dataset`` below). When None, a
		random dataset from the map is chosen.

	Returns
	-------
	tuple
		(X_train, X_test, y_train, y_test, names, sensitive_ids, key,
		sensitive_attribute_id) — transformed train/test matrices,
		label-encoded targets, readable feature names, the indices of the
		one-hot columns encoding the sensitive attribute, the dataset key,
		and the raw column index of the sensitive attribute.
	"""
	# maps openML dataset id -> sensitive attribute column (ARFF 'name@domain')
	map_dataset = {}

	map_dataset['31'] = 'foreign_worker@{yes,no}'
	map_dataset['802'] = 'sex@{female,male}'
	map_dataset['1590'] = 'sex@{Female,Male}'
	map_dataset['1461'] = 'AGE@{True,False}'
	map_dataset['42193'] = 'race_Caucasian@{0,1}'
	map_dataset['1480'] = 'V2@{Female,Male}'
	# map_dataset['804'] = 'Gender@{0,1}'
	map_dataset['42178'] = 'gender@STRING'
	map_dataset['981'] = 'Gender@{Female,Male}'
	map_dataset['40536'] = 'samerace@{0,1}'
	map_dataset['40945'] = 'sex@{female,male}'
	map_dataset['451'] = 'Sex@{female,male}'
	# map_dataset['945'] = 'sex@{female,male}'
	map_dataset['446'] = 'sex@{Female,Male}'
	map_dataset['1017'] = 'sex@{0,1}'
	map_dataset['957'] = 'Sex@{0,1,4}'
	map_dataset['41430'] = 'SEX@{True,False}'
	map_dataset['1240'] = 'sex@{Female,Male}'
	map_dataset['1018'] = 'sex@{Female,Male}'
	# map_dataset['55'] = 'SEX@{male,female}'
	map_dataset['38'] = 'sex@{F,M}'
	map_dataset['1003'] = 'sex@{male,female}'
	map_dataset['934'] = 'race@{black,white}'


	# dataset size statistics, appended as the dataset is processed
	number_instances = []
	number_attributes = []
	number_features = []

	def get_class_attribute_name(df):
		# first column following the ARFF 'class@...' naming convention
		for i in range(len(df.columns)):
			if str(df.columns[i]).startswith('class@'):
				return str(df.columns[i])

	def get_sensitive_attribute_id(df, sensitive_attribute_name):
		# positional index of the named column (None if absent)
		for i in range(len(df.columns)):
			if str(df.columns[i]) == sensitive_attribute_name:
				return i

	# pick a random dataset when no key was given
	key = dataset_key
	if type(dataset_key) == type(None):
		key = list(map_dataset.keys())[random.randint(0, len(map_dataset) - 1)]

	value = map_dataset[key]
	with open(Config.get('data_path') + "/downloaded_arff/" + str(key) + ".arff") as f:
		df = a2p.load(f)

		print("dataset: " + str(key))

		number_instances.append(df.shape[0])
		number_attributes.append(df.shape[1])

		y = copy.deepcopy(df[get_class_attribute_name(df)])
		X = df.drop(columns=[get_class_attribute_name(df)])

		# split columns by dtype: object -> categorical, else continuous
		categorical_features = []
		continuous_columns = []
		for type_i in range(len(X.columns)):
			if X.dtypes[type_i] == object:
				categorical_features.append(type_i)
			else:
				continuous_columns.append(type_i)

		sensitive_attribute_id = get_sensitive_attribute_id(X, value)

		print(sensitive_attribute_id)

		# replace None cells: 'missing' for categoricals, NaN for numerics
		X_datat = X.values
		for x_i in range(X_datat.shape[0]):
			for y_i in range(X_datat.shape[1]):
				if type(X_datat[x_i][y_i]) == type(None):
					if X.dtypes[y_i] == object:
						X_datat[x_i][y_i] = 'missing'
					else:
						X_datat[x_i][y_i] = np.nan


		X_train, X_test, y_train, y_test = train_test_split(X_datat, y.values.astype('str'), test_size=0.5,
															random_state=42, stratify=y.values.astype('str'))
		'''
		X_train, X_test, y_train, y_test = train_test_split(X_datat[0:200,:], y.values[0:200].astype('str'), test_size=0.5,
															random_state=42, stratify=y.values[0:200].astype('str'))
		'''

		# position of the sensitive attribute within the categorical list
		# (the index used in the one-hot encoder's generated names)
		cat_sensitive_attribute_id = -1
		for c_i in range(len(categorical_features)):
			if categorical_features[c_i] == sensitive_attribute_id:
				cat_sensitive_attribute_id = c_i
				break

		# one-hot encode categoricals, impute+scale continuous columns
		my_transformers = []
		if len(categorical_features) > 0:
			ct = ColumnTransformer(
				[("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)])
			my_transformers.append(("o", ct))
		if len(continuous_columns) > 0:
			scale = ColumnTransformer([("scale", Pipeline(
				[('impute', SimpleImputer(missing_values=np.nan, strategy='mean')), ('scale', MinMaxScaler())]),
										continuous_columns)])
			my_transformers.append(("s", scale))

		pipeline = FeatureUnion(my_transformers)
		pipeline.fit(X_train)
		X_train = pipeline.transform(X_train)
		X_test = pipeline.transform(X_test)

		number_features.append(X_train.shape[1])

		# strip the '@domain' suffix from the raw column names
		all_columns = []
		for ci in range(len(X.columns)):
			all_columns.append(str(X.columns[ci]).split('@')[0])
		X.columns = all_columns

		# NOTE(review): 'ct' is only bound when categorical features exist;
		# a dataset with none would raise NameError here — confirm intended.
		names = ct.get_feature_names()
		for c in continuous_columns:
			names.append(str(X.columns[c]))

		# rewrite generated names 'onehot__x<idx>_<cat>' into
		# '<original column name><category>' for readability
		for n_i in range(len(names)):
			if names[n_i].startswith('onehot__x'):
				tokens = names[n_i].split('_')
				category = ''
				for ti in range(3, len(tokens)):
					category += '_' + tokens[ti]
				cat_id = int(names[n_i].split('_')[2].split('x')[1])
				names[n_i] = str(X.columns[categorical_features[cat_id]]) + category

		print(names)

		# indices of the one-hot columns that encode the sensitive attribute
		sensitive_ids = []
		all_names = ct.get_feature_names()
		for fname_i in range(len(all_names)):
			if all_names[fname_i].startswith('onehot__x' + str(cat_sensitive_attribute_id) + '_'):
				sensitive_ids.append(fname_i)

		le = preprocessing.LabelEncoder()
		le.fit(y_train)
		y_train = le.fit_transform(y_train)
		y_test = le.transform(y_test)

		return X_train, X_test, y_train, y_test, names, sensitive_ids, key, sensitive_attribute_id
import itertools
from sklearn.ensemble import RandomForestRegressor
import scipy.special
import seaborn as sns
import matplotlib.pyplot as plt
from fastsklearnfeature.configuration.Config import Config

#X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:,0:500] [0:100,:]
#y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values [0:100]
'''
X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data', delimiter=' ', header=None).values[:,0:10000][0:100,:]
y_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.labels', delimiter=' ', header=None).values[0:100]
data_name = 'ARCENE_sample'
'''

# Load an ARCENE sample capped at 1000 rows and 10000 columns
# (assumes the training file may have fewer rows — TODO confirm).
X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data',
                      delimiter=' ',
                      header=None).values[:, 0:10000][0:1000, :]
y_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.labels',
                      delimiter=' ',
                      header=None).values[0:1000]
data_name = 'ARCENE_sample1k'

# generate grid
# complexity axis: one entry per possible number of selected features;
# accuracy axis: 100 evenly spaced thresholds in [0.0, max_acc)
complexity_grid = np.arange(1, X_train.shape[1] + 1)
max_acc = 1.0
accuracy_grid = np.arange(0.0, max_acc, max_acc / 100.0)


def get_estimated_runtimes(old_model="/tmp/model11_hyperopt.p"):
示例#18
0
def run_strategy(strategy_method, ranking_id, strategy_id):
    data_infos = pickle.load(
        open(
            Config.get('data_path') + '/openml_data/fitting_datasets.pickle',
            'rb'))

    time_limit = 60 * 20

    meta_classifier = RandomForestRegressor(n_estimators=1000)
    X_train_meta_classifier = []
    y_train_meta_classifier = []

    cv_splitter = StratifiedKFold(5, random_state=42)
    auc_scorer = make_scorer(roc_auc_score,
                             greater_is_better=True,
                             needs_threshold=True)

    acc_value_list = []
    fair_value_list = []
    robust_value_list = []
    success_value_list = []
    runtime_value_list = []
    dataset_did_list = []
    dataset_sensitive_attribute_list = []

    while True:
        X_train, X_test, y_train, y_test, names, sensitive_ids, data_did, sensitive_attribute_id = get_data_openml(
            data_infos)

        #run on tiny sample
        X_train_tiny, _, y_train_tiny, _ = train_test_split(X_train,
                                                            y_train,
                                                            train_size=100,
                                                            random_state=42,
                                                            stratify=y_train)

        fair_train_tiny = make_scorer(
            true_positive_rate_score,
            greater_is_better=True,
            sensitive_data=X_train_tiny[:, sensitive_ids[0]])

        def objective(hps):
            print(hps)

            try:

                cv_k = 1.0
                cv_privacy = hps['privacy']
                model = LogisticRegression()
                if type(cv_privacy) == type(None):
                    cv_privacy = X_train_tiny.shape[0]
                else:
                    model = models.LogisticRegression(epsilon=cv_privacy)

                robust_scorer = make_scorer(robust_score,
                                            greater_is_better=True,
                                            X=X_train_tiny,
                                            y=y_train_tiny,
                                            model=model,
                                            feature_selector=None,
                                            scorer=auc_scorer)

                cv = GridSearchCV(model,
                                  param_grid={'C': [1.0]},
                                  scoring={
                                      'AUC': auc_scorer,
                                      'Fairness': fair_train_tiny,
                                      'Robustness': robust_scorer
                                  },
                                  refit=False,
                                  cv=cv_splitter)
                cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
                cv_acc = cv.cv_results_['mean_test_AUC'][0]
                cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
                cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

                small_start_time = time.time()

                cv = GridSearchCV(model,
                                  param_grid={'C': [1.0]},
                                  scoring={
                                      'AUC': auc_scorer,
                                      'Fairness': fair_train_tiny,
                                      'Robustness': robust_scorer
                                  },
                                  refit=False,
                                  cv=cv_splitter)
                cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
                cv_acc = cv.cv_results_['mean_test_AUC'][0]
                cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
                cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

                # construct feature vector
                feature_list = []
                # user-specified constraints
                feature_list.append(hps['accuracy'])
                feature_list.append(hps['fairness'])
                feature_list.append(hps['k'])
                feature_list.append(hps['k'] * X_train.shape[1])
                feature_list.append(hps['robustness'])
                feature_list.append(cv_privacy)
                feature_list.append(hps['search_time'])
                # differences to sample performance
                feature_list.append(cv_acc - hps['accuracy'])
                feature_list.append(cv_fair - hps['fairness'])
                feature_list.append(cv_k - hps['k'])
                feature_list.append((cv_k - hps['k']) * X_train.shape[1])
                feature_list.append(cv_robust - hps['robustness'])
                feature_list.append(time.time() - small_start_time)
                # privacy constraint is always satisfied => difference always zero => constant => unnecessary

                # metadata features
                feature_list.append(X_train.shape[0])  # number rows
                feature_list.append(X_train.shape[1])  # number columns

                features = np.array(feature_list)

                #predict the best model and calculate uncertainty

                loss = 0
                if hasattr(meta_classifier, 'estimators_'):
                    predictions = []
                    for tree in range(len(meta_classifier.estimators_)):
                        predictions.append(
                            meta_classifier.estimators_[tree].predict(
                                [features])[0])

                    stddev = np.std(np.array(predictions), axis=0)
                    print('stddev: ' + str(stddev))

                    loss = (stddev**2) * -1

                return {
                    'loss': loss,
                    'status': STATUS_OK,
                    'features': features
                }
            except:
                return {'loss': np.inf, 'status': STATUS_OK}

        space = {
            'k':
            hp.choice('k_choice', [(1.0), (hp.uniform('k_specified', 0, 1))]),
            'accuracy':
            hp.uniform('accuracy_specified', 0.5, 1),
            'fairness':
            hp.choice('fairness_choice',
                      [(0.0), (hp.uniform('fairness_specified', 0, 1))]),
            'privacy':
            hp.choice('privacy_choice',
                      [(None), (hp.lognormal('privacy_specified', 0, 1))]),
            'robustness':
            hp.choice('robustness_choice',
                      [(0.0), (hp.uniform('robustness_specified', 0, 1))]),
            'search_time':
            hp.uniform('search_time_specified', 10, time_limit),  # in seconds
        }

        trials = Trials()
        runs_per_dataset = 0
        i = 1
        while True:
            fmin(objective,
                 space=space,
                 algo=tpe.suggest,
                 max_evals=i,
                 trials=trials)
            i += 1

            if trials.trials[-1]['result']['loss'] == np.inf:
                break

            #break, once convergence tolerance is reached and generate new dataset
            if trials.trials[-1]['result']['loss'] == 0 or i % 20 == 0:
                best_trial = trials.trials[-1]
                if i % 20 == 0:
                    best_trial = trials.best_trial
                most_uncertain_f = best_trial['misc']['vals']
                #print(most_uncertain_f)

                min_accuracy = most_uncertain_f['accuracy_specified'][0]
                min_fairness = 0.0
                if most_uncertain_f['fairness_choice'][0]:
                    min_fairness = most_uncertain_f['fairness_specified'][0]
                min_robustness = 0.0
                if most_uncertain_f['robustness_choice'][0]:
                    min_robustness = most_uncertain_f['robustness_specified'][
                        0]
                max_number_features = X_train.shape[1]
                if most_uncertain_f['k_choice'][0]:
                    max_number_features = most_uncertain_f['k_specified'][0]

                max_search_time = most_uncertain_f['search_time_specified'][0]

                # Execute each search strategy with a given time limit (in parallel)
                # maybe run multiple times to smooth stochasticity

                model = LogisticRegression()
                if most_uncertain_f['privacy_choice'][0]:
                    model = models.LogisticRegression(
                        epsilon=most_uncertain_f['privacy_specified'][0])

                rankings = [variance, chi2_score_wo]  # simple rankings
                rankings.append(
                    partial(model_score,
                            estimator=ExtraTreesClassifier(
                                n_estimators=1000)))  # accuracy ranking
                rankings.append(
                    partial(robustness_score, model=model,
                            scorer=auc_scorer))  # robustness ranking
                rankings.append(
                    partial(fairness_score,
                            estimator=ExtraTreesClassifier(n_estimators=1000),
                            sensitive_ids=sensitive_ids))  # fairness ranking

                selected_rankings = rankings
                if type(ranking_id) != type(None):
                    selected_rankings = [rankings[ranking_id]]

                result = strategy_method(
                    X_train,
                    X_test,
                    y_train,
                    y_test,
                    names,
                    sensitive_ids,
                    ranking_functions=selected_rankings,
                    clf=model,
                    min_accuracy=min_accuracy,
                    min_fairness=min_fairness,
                    min_robustness=min_robustness,
                    max_number_features=max_number_features,
                    max_search_time=max_search_time,
                    cv_splitter=cv_splitter)

                # append ml data
                X_train_meta_classifier.append(
                    best_trial['result']['features'])
                y_train_meta_classifier.append(result['time'])

                try:
                    meta_classifier.fit(np.array(X_train_meta_classifier),
                                        y_train_meta_classifier)
                except:
                    pass

                #pickle everything and store it
                one_big_object = {}
                one_big_object['features'] = X_train_meta_classifier
                #one_big_object['best_strategy'] = y_train_meta_classifier

                runtime_value_list.append(result['time'])
                acc_value_list.append(result['cv_acc'])
                fair_value_list.append(result['cv_fair'])
                robust_value_list.append(result['cv_robust'])
                success_value_list.append(result['success'])

                dataset_did_list.append(data_did)
                dataset_sensitive_attribute_list.append(sensitive_attribute_id)

                one_big_object['times_value'] = runtime_value_list
                one_big_object['acc_value'] = acc_value_list
                one_big_object['fair_value'] = fair_value_list
                one_big_object['robust_value'] = robust_value_list
                one_big_object['success_value'] = success_value_list
                one_big_object['dataset_id'] = dataset_did_list
                one_big_object[
                    'sensitive_attribute_id'] = dataset_sensitive_attribute_list

                pickle.dump(
                    one_big_object,
                    open(
                        '/tmp/metalearning_data' + str(strategy_id) +
                        '.pickle', 'wb'))

                trials = Trials()
                i = 1
                runs_per_dataset += 1
                break
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.exhaustive import exhaustive
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.forward_floating_selection import forward_selection
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.backward_floating_selection import backward_selection
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.forward_floating_selection import forward_floating_selection
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.backward_floating_selection import backward_floating_selection
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.recursive_feature_elimination import recursive_feature_elimination


#static constraints: fairness, number of features (absolute and relative), robustness, privacy, accuracy

from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.bench_utils import get_fair_data1
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired

#load list of viable datasets
# Metadata for the OpenML datasets known to work with this benchmark.
# NOTE(review): the file handle passed to pickle.load is never closed;
# consider wrapping the open() in a with-block.
data_infos = pickle.load(open(Config.get('data_path') + '/openml_data/fitting_datasets.pickle', 'rb'))

# Timestamp identifying this benchmark run.
current_run_time_id = time.time()

time_limit = 60 * 60 * 3  # 3 hours, in seconds
n_jobs = 20
number_of_runs = 1

# Accumulated training data for the meta-classifier (filled per finished run).
X_train_meta_classifier = []
y_train_meta_classifier = []

ranking_scores_info = []


# Per-run result accumulators.
acc_value_list = []
fair_value_list = []
示例#20
0
    for i in range(len(df.columns)):
        if str(df.columns[i]).startswith('class@'):
            return str(df.columns[i])


def get_sensitive_attribute_id(df, sensitive_attribute_name):
    """Return the positional index of the column named *sensitive_attribute_name*.

    Column names are compared after ``str()`` conversion. Returns ``None``
    (implicitly) when no column matches.
    """
    for column_index, column_name in enumerate(df.columns):
        if str(column_name) == sensitive_attribute_name:
            return column_index


random_number = 42

for key, value in map_dataset.items():
    with open(
            Config.get('data_path') + "/downloaded_arff/" + str(key) +
            ".arff") as f:
        df = a2p.load(f)

        print("dataset: " + str(key))

        name_dataset = openml.datasets.get_dataset(dataset_id=int(key),
                                                   download_data=False).name

        number_instances = df.shape[0]
        number_attributes = df.shape[1]

        y = copy.deepcopy(df[get_class_attribute_name(df)])
        X = df.drop(columns=[get_class_attribute_name(df)])

        categorical_features = []
示例#21
0
    hp.uniform('informative_specified', 0, 1),
    'n_redundant':
    hp.uniform('redundant_specified', 0, 1),
    'n_repeated':
    hp.uniform('repeated_specified', 0, 1),
    'n_useless':
    hp.uniform('useless_specified', 0, 1),
    'n_clusters_per_class':
    hp.randint('clusters_specified', 1, 10),
}

configurations = []
try:
    configurations = pickle.load(
        open(
            Config.get('data_path') +
            "/scaling_configurations_samples/scaling_configurations_models.pickle",
            "rb"))
except:
    while len(configurations) < 100:
        my_config = hyperopt.pyll.stochastic.sample(space)
        try:
            generate_data(100, 50, my_config, 0)
            configurations.append(my_config)
        except:
            continue

    pickle.dump(
        configurations,
        open(
            Config.get('data_path') +
示例#22
0
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#static constraints: fairness, number of features (absolute and relative), robustness, privacy, accuracy

from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.bench_utils import get_fair_data1
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired
from arff2pandas import a2p

#load list of viable datasets
# Metadata for the OpenML datasets known to work with this benchmark.
# NOTE(review): the file handle passed to pickle.load is never closed;
# consider wrapping the open() in a with-block.
data_infos = pickle.load(
    open(
        Config.get('data_path') + '/openml_data/fitting_datasets.pickle',
        'rb'))

# Timestamp identifying this benchmark run.
current_run_time_id = time.time()

time_limit = 60 * 60 * 3  # 3 hours, in seconds
n_jobs = 20
number_of_runs = 1

# Accumulated training data for the meta-classifier (filled per finished run).
X_train_meta_classifier = []
y_train_meta_classifier = []

ranking_scores_info = []

# Per-run result accumulators.
acc_value_list = []
fair_value_list = []
            print("(" + str(r + 1) + "," + str(results[r]['test_score']) + ")")

        #new_scores = [r['test_score'] for r in results]
        #best_id = np.argmax(new_scores)

        #print(results[best_id])


#statlog_heart.csv=/home/felix/datasets/ExploreKit/csv/dataset_53_heart-statlog_heart.csv
#statlog_heart.target=13

if __name__ == '__main__':
    # Entry point: run the iterative ExploreKit feature search on one dataset.
    # Each tuple is (csv path, number) — presumably the target column index
    # (cf. the statlog_heart.target=13 note above); verify against the consumer.
    #dataset = (Config.get('statlog_heart.csv'), int(Config.get('statlog_heart.target')))
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/phpOJxGL9_indianliver.csv", 10)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv", 20)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv", 9)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/phpn1jVwe_mammography.csv", 6)

    dataset = (Config.get('transfusion.csv'), 4)

    selector = ExploreKitSelection_iterative_search(dataset)
    #selector = ExploreKitSelection(dataset, KNeighborsClassifier(), {'n_neighbors': np.arange(3,10), 'weights': ['uniform','distance'], 'metric': ['minkowski','euclidean','manhattan']})

    results = selector.run()

    # Persist all iteration results for later analysis.
    # NOTE(review): the file handle passed to pickle.dump is never closed.
    pickle.dump(results, open("/tmp/all_data_iterations.p", "wb"))
示例#24
0
from sklearn.metrics import precision_score
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import multiprocessing as mp
import itertools
from sklearn.ensemble import RandomForestRegressor
import scipy.special
import seaborn as sns
import matplotlib.pyplot as plt
from fastsklearnfeature.configuration.Config import Config


# First 100 rows and first 500 columns of the madelon training data.
X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:,0:500] [0:100,:]
y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values [0:100]


# generate grid
# Candidate feature counts 1..n_features, with one evenly spaced accuracy
# threshold in [0, max_acc) per complexity value.
complexity_grid = np.arange(1, X_train.shape[1]+1)
max_acc = 1.0
accuracy_grid = np.arange(0.0, max_acc, max_acc / len(complexity_grid))

def get_estimated_runtimes(old_model = "/tmp/model11_hyperopt.p"):

	grid = list(itertools.product(complexity_grid, accuracy_grid))
	meta_X_data = np.matrix(grid)

	al_model = pickle.load(open(old_model, "rb"))
	runtime_predictions = al_model.predict(meta_X_data)
示例#25
0
 def filter_failing_in_parallel(self):
     """Run ``self.filter_candidate`` over ``self.candidates`` in parallel.

     Uses a process pool sized by the ``parallelism`` config value and
     returns the flattened list of results from all workers.
     """
     # The context manager terminates the worker processes even if
     # filter_candidate raises; the original Pool was never closed/joined,
     # leaking worker processes.
     with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
         results = pool.map(self.filter_candidate, self.candidates)
     return list(itertools.chain(*results))
示例#26
0
                if rep != None:
                    all_representations.append(rep)
                    all_seen_representations.append(rep)



if __name__ == '__main__':
    # Entry point: select one benchmark dataset. Each tuple is
    # (csv path, number) — presumably the target column index; verify.
    # NOTE(review): the visible code only builds `dataset`; the code that
    # consumes it is not part of this excerpt.
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8)

    #dataset = (Config.get('data_path') + "/phpn1jVwe_mammography.csv", 6)
    #dataset = (Config.get('data_path') + "/dataset_23_cmc_contraceptive.csv", 9)
    #dataset = (Config.get('data_path') + "/dataset_31_credit-g_german_credit.csv", 20)
    dataset = (Config.get('data_path') + '/dataset_53_heart-statlog_heart.csv', 13)
    #dataset = (Config.get('data_path') + '/ILPD.csv', 10)
    #dataset = (Config.get('data_path') + '/iris.data', 4)
    #dataset = (Config.get('data_path') + '/data_banknote_authentication.txt', 4)
    #dataset = (Config.get('data_path') + '/ecoli.data', 8)
    #dataset = (Config.get('data_path') + '/breast-cancer.data', 0)
    #dataset = (Config.get('data_path') + '/transfusion.data', 4)
    #dataset = (Config.get('data_path') + '/test_categorical.data', 4)
    #dataset = ('../configuration/resources/data/transfusion.data', 4)
    #dataset = (Config.get('data_path') + '/wine.data', 0)

    #dataset = (Config.get('data_path') + '/house_price.csv', 79)
    #dataset = (Config.get('data_path') + '/synthetic_data.csv', 3)


示例#27
0
        start_time = time.time()

        results = self.evaluate_candidates(
            np.array(self.candidates)[ranked_selected_candidate_ids])

        print("evaluation time: " + str((time.time() - start_time) / 60) +
              " min")

        return start_score, results, ranked_selected_candidate_ids


#statlog_heart.csv=/home/felix/datasets/ExploreKit/csv/dataset_53_heart-statlog_heart.csv
#statlog_heart.target=13

if __name__ == '__main__':
    # Entry point: run the ExploreKit feature search on the statlog-heart
    # dataset; csv path and target column index come from the config file.
    dataset = (Config.get('statlog_heart.csv'),
               int(Config.get('statlog_heart.target')))
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/phpOJxGL9_indianliver.csv", 10)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv", 20)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv", 9)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/phpn1jVwe_mammography.csv", 6)

    selector = ExploreKitSelection(dataset)
    #selector = ExploreKitSelection(dataset, KNeighborsClassifier(), {'n_neighbors': np.arange(3,10), 'weights': ['uniform','distance'], 'metric': ['minkowski','euclidean','manhattan']})

    start_score, results, ids = selector.run()
示例#28
0
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import multiprocessing as mp
import itertools
from sklearn.ensemble import RandomForestRegressor
import scipy.special
import seaborn as sns
import matplotlib.pyplot as plt
from fastsklearnfeature.configuration.Config import Config

# First 100 rows and first 500 columns of the madelon training data.
X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data',
                      delimiter=' ',
                      header=None).values[:, 0:500][0:100, :]
y_train = pd.read_csv(Config.get('data_path') +
                      '/madelon/madelon_train.labels',
                      delimiter=' ',
                      header=None).values[0:100]

# NOTE(review): purpose of `name` not visible in this excerpt — presumably a
# tag for output artifacts of this run; verify against later code.
name = 'hyperopt'

# generate grid
# Candidate feature counts 1..n_features, with one evenly spaced accuracy
# threshold in [0, max_acc) per complexity value.
complexity_grid = np.arange(1, X_train.shape[1] + 1)
max_acc = 0.8
accuracy_grid = np.arange(0.0, max_acc, max_acc / len(complexity_grid))

示例#29
0
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier

from fastsklearnfeature.configuration.Config import Config

from skrebate import ReliefF
from fastsklearnfeature.interactiveAutoML.feature_selection.fcbf_package import my_fisher_score

# Column presumably treated as the protected attribute for fairness
# evaluation — verify against the consumer of this variable.
sensitive_attribute = "sex"

n_estimators = 5

# Adult census dataset; the 'class' column is the prediction target and is
# removed from the feature frame.
df = pd.read_csv(Config.get('data_path') + '/adult/dataset_183_adult.csv',
                 delimiter=',',
                 header=0)
y = df['class']
del df['class']
X = df
one_hot = True

# Only the first `limit` rows are used.
limit = 1000

X_train, X_test, y_train, y_test = train_test_split(X.values[0:limit, :],
                                                    y.values[0:limit],
                                                    test_size=0.5,
                                                    random_state=42)
xshape = X_train.shape[1]  # feature count before one-hot encoding
# NOTE(review): OneHotEncoder's `sparse` kwarg was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4 — confirm the pinned sklearn version.
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
示例#30
0
File: Reader.py  Project: BigDaMa/DFS
        self.splitted_values = {}
        self.splitted_target= {}

        self.splitted_target['train'], self.splitted_target['valid'], self.splitted_target['test'] = self.splitter.materialize_target(self.target_values)
        self.splitted_values['train'], self.splitted_values['valid'],self.splitted_values['test'] = self.splitter.materialize_values(self.dataframe)

        for attribute_i in range(self.dataframe.shape[1]):
            rf = RawFeature(self.dataframe.columns[attribute_i], attribute_i, {})
            rf.derive_properties(self.dataframe[self.dataframe.columns[attribute_i]].values)
            self.raw_features.append(rf)


        return self.raw_features





if __name__ == '__main__':
    # Smoke test: read one dataset with a RandomSplitter and print the
    # derived properties of every raw feature.
    from fastsklearnfeature.splitting.RandomSplitter import RandomSplitter
    from fastsklearnfeature.configuration.Config import Config

    s = RandomSplitter()

    # (csv path, number) — presumably the target column index; verify.
    dataset = (Config.get('data_path') + '/house_price.csv', 79)
    r = Reader(dataset[0], dataset[1], s)
    r.read()

    for rf in r.raw_features:
        print(str(rf) + ": " + str(rf.properties))