Example #1
def get_transformation_for_division(train_X_all, raw_features):

    unary_transformations: List[UnaryTransformation] = []
    binary_transformations: List[Transformation] = []

    #unary_transformations.append(PandasDiscretizerTransformation(number_bins=10))
    unary_transformations.append(MinMaxScalingTransformation())
    unary_transformations.append(MDLPDiscretizerTransformation())

    unary_transformations.append(OneDivisionTransformation())
    unary_transformations.append(MinusTransformation())

    unary_transformations.append(ImputationTransformation('mean'))
    #unary_transformations.append(ImputationTransformation('median'))
    #unary_transformations.append(ImputationTransformation('most_frequent'))

    binary_transformations.extend(
        HigherOrderCommutativeClassGenerator(
            2,
            methods=[np.nansum, np.nanprod],
            sympy_methods=[sympy.Add, sympy.Mul]).produce())

    binary_transformations.extend(
        GroupByThenGenerator(
            2,
            methods=[np.nanmax, np.nanmin, np.nanmean, np.nanstd],
            sympy_methods=[
                groupbythenmax, groupbythenmin, groupbythenmean, groupbythenstd
            ]).produce())

    unary_transformations.extend(
        OneHotGenerator(train_X_all, raw_features).produce())

    return unary_transformations, binary_transformations
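
This helper only assembles transformation objects. As a rough standalone illustration (plain NumPy, not the fastsklearnfeature API; it assumes OneDivisionTransformation computes 1/x and MinusTransformation computes -x), the generated features correspond to columns like these:

import numpy as np

x = np.array([1.0, 2.0, np.nan, 4.0])
y = np.array([5.0, np.nan, 7.0, 8.0])

one_division = 1.0 / x                             # unary: reciprocal of each value
minus = -x                                         # unary: negation
pair_sum = np.nansum(np.vstack([x, y]), axis=0)    # binary: NaN-aware sum
pair_prod = np.nanprod(np.vstack([x, y]), axis=0)  # binary: NaN-aware product

print(one_division, minus, pair_sum, pair_prod, sep='\n')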
Example #2
    def fit(self, X, y=None):
        fe = ComplexityDrivenFeatureConstruction(
            None,
            reader=ScikitReader(
                X,
                y,
                feature_names=self.feature_names,
                feature_is_categorical=self.feature_is_categorical),
            score=self.scoring,
            c_max=self.c_max,
            folds=self.cv,
            max_seconds=self.max_time_secs,
            classifier=self.model.__class__,
            grid_search_parameters=self.parameter_grid,
            n_jobs=self.n_jobs,
            epsilon=self.epsilon,
            remove_parents=False,
            transformation_producer=self.transformation_producer)

        fe.run()

        numeric_representations = []
        for r in fe.all_representations:
            if 'score' in r.runtime_properties:
                if 'object' not in str(r.properties['type']):
                    if not isinstance(r.transformation,
                                      MinMaxScalingTransformation):
                        #if not (isinstance(r.transformation, HigherOrderCommutativeTransformation) and r.transformation.method == np.nansum):
                        if isinstance(r.sympy_representation, sympy.Mul):
                            # drop plain negations, i.e. Mul(-1, ...)
                            if not any(e == S.NegativeOne
                                       for e in r.sympy_representation.args):
                                numeric_representations.append(r)
                        else:
                            numeric_representations.append(r)

        self.numeric_features = numeric_representations

        # persist the candidate feature names (not the raw data) for later inspection
        feature_names = [str(ff) for ff in self.numeric_features]

        with open('/tmp/names.pickle', 'wb') as f:
            pickle.dump(feature_names, f, pickle.HIGHEST_PROTOCOL)

        all_features = CandidateFeature(IdentityTransformation(-1),
                                        numeric_representations)

        #all_imputation = CandidateFeature(ImputationTransformation(), [all_features])
        all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                            [all_features])

        self.pipeline_ = all_standardized.pipeline

        self.pipeline_.fit(X, y)
        return self
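
A minimal sympy sketch of the filter in fit above: candidates whose symbolic form is a product containing -1 (a plain negation) are dropped, everything else is kept. Standalone, with a bare symbol standing in for a candidate's sympy_representation:

import sympy
from sympy import S

x, y = sympy.symbols('x y')
negated = -x     # sympy represents this as Mul(-1, x)
product = x * y  # a genuine product

for expr in (negated, product):
    is_negation = isinstance(expr, sympy.Mul) and any(
        arg == S.NegativeOne for arg in expr.args)
    print(expr, 'dropped' if is_negation else 'kept')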
Example #3
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate(42)
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        level_scores: Dict[int, List[float]] = {}
        level_test_scores: Dict[int, List[float]] = {}

        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/eucalyptus')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/contraceptive')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/diabetes')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/credit')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/heart_new_all')
        #string2candidate = self.load_data_all('/tmp')

        baseline_features: List[CandidateFeature] = []
        for r in self.raw_features:
            if r.is_numeric() and not r.properties['categorical']:
                if not r.properties['missing_values']:
                    baseline_features.append(r)
                else:
                    baseline_features.append(
                        CandidateFeature(ImputationTransformation(), [r]))
            else:
                baseline_features.extend([
                    CandidateFeature(t, [r])
                    for t in OneHotGenerator(self.train_X_all, [r]).produce()
                ])

        #baseline_features.extend(self.get_interesting_features('/home/felix/phd/fastfeatures/results/heart_small', 24))
        #baseline_features.extend(self.get_interesting_features('/home/felix/phd/fastfeatures/results/heart_new_all', 10))
        #baseline_features.extend(self.get_interesting_features(string2candidate, 2))
        '''
        for c in baseline_features:
            if isinstance(c, RawFeature):
                print(str(c) + " complexity: " + str(c.get_complexity()))
            else:
                print('nr: ' + str(c) + " complexity: " + str(c.get_complexity()))
        '''

        # scale each baseline feature to [0, 1]
        scaled_baseline_features = []
        for c in baseline_features:
            scaled_baseline_features.append(
                CandidateFeature(MinMaxScalingTransformation(), [c]))

        #scaled_baseline_features = baseline_features

        combo = CandidateFeature(
            IdentityTransformation(len(baseline_features)),
            scaled_baseline_features)

        results = self.evaluate_candidates_detail([combo], myfolds, 1)

        print(str(results[0].runtime_properties))
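
The baseline loop above mean-imputes numeric columns with missing values, one-hot encodes everything else, and min-max scales the result. A standalone scikit-learn analogue of that preprocessing (an analogy only, not the CandidateFeature machinery):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

X = np.array([[1.0, 'a'], [np.nan, 'b'], [3.0, 'a']], dtype=object)

baseline = Pipeline([
    ('columns', ColumnTransformer([
        ('num', SimpleImputer(strategy='mean'), [0]),  # numeric with missing values
        ('cat', OneHotEncoder(), [1]),                 # categorical -> one-hot
    ], sparse_threshold=0.0)),
    ('scale', MinMaxScaler()),                         # scale everything to [0, 1]
])
print(baseline.fit_transform(X))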
Example #4
def get_transformation_for_cat_feature_space(train_X_all, raw_features):

    unary_transformations: List[UnaryTransformation] = []
    binary_transformations: List[Transformation] = []

    unary_transformations.append(
        PandasDiscretizerTransformation(number_bins=10))
    unary_transformations.append(MinMaxScalingTransformation())

    binary_transformations.extend(
        HigherOrderCommutativeClassGenerator(
            2,
            methods=[np.nansum, np.nanprod],
            sympy_methods=[sympy.Add, sympy.Mul]).produce())
    binary_transformations.extend(
        NumpyBinaryClassGenerator(methods=[np.divide, np.subtract],
                                  sympy_methods=[sympy_divide,
                                                 sympy_subtract]).produce())

    binary_transformations.extend(
        GroupByThenGenerator(
            2,
            methods=[np.nanmax, np.nanmin, np.nanmean, np.nanstd],
            sympy_methods=[
                groupbythenmax, groupbythenmin, groupbythenmean, groupbythenstd
            ]).produce())

    unary_transformations.extend(
        OneHotGenerator(train_X_all, raw_features).produce())

    return unary_transformations, binary_transformations
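
GroupByThen features give each row an aggregate of its group. A rough pandas sketch of the presumed semantics (an assumption about what the generated transformations compute, not the GroupByThenGenerator API):

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'], 'val': [1.0, 3.0, 2.0, 5.0]})
df['groupbythenmax'] = df.groupby('key')['val'].transform('max')    # per-group max
df['groupbythenmean'] = df.groupby('key')['val'].transform('mean')  # per-group mean
print(df)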
Example #5
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate(42)
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        level_scores: Dict[int, List[float]] = {}
        level_test_scores: Dict[int, List[float]] = {}

        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/eucalyptus')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/contraceptive')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/diabetes')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/credit')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/heart_new_all')
        #string2candidate = self.load_data_all('/tmp')

        with open('/tmp/cover_features.p', 'rb') as f:
            features = pickle.load(f)

        # apply min-max scaling
        new_features: List[CandidateFeature] = []
        for f in features:
            new_features.append(
                CandidateFeature(MinMaxScalingTransformation(), [f]))

        results = self.evaluate_candidates([
            CandidateFeature(IdentityTransformation(len(new_features)),
                             new_features)
        ], myfolds)

        print(results[0])
        print(results[0].runtime_properties)

        return results[0]
Example #6
    def produce_features(self):
        unary_transformations: List[UnaryTransformation] = []
        unary_transformations.append(
            PandasDiscretizerTransformation(number_bins=10))
        unary_transformations.append(MinMaxScalingTransformation())

        higher_order_transformations: List[Transformation] = []
        higher_order_transformations.extend(
            HigherOrderCommutativeClassGenerator(
                2, methods=[np.nansum, np.nanprod]).produce())
        higher_order_transformations.extend(
            NumpyBinaryClassGenerator(
                methods=[np.divide, np.subtract]).produce())

        # count is missing
        higher_order_transformations.extend(
            GroupByThenGenerator(
                2, methods=[np.nanmax, np.nanmin, np.nanmean,
                            np.nanstd]).produce())

        Fui = self.generate_features1(unary_transformations, self.raw_features)

        Fi_and_Fui = []
        Fi_and_Fui.extend(self.raw_features)
        Fi_and_Fui.extend(Fui)

        Foi = self.generate_features1(higher_order_transformations, Fi_and_Fui)

        Foui = self.generate_features1(unary_transformations, Foi)

        Fi_cand = []
        Fi_cand.extend(Fui)
        Fi_cand.extend(Foi)
        Fi_cand.extend(Foui)

        return Fi_cand
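
The naming encodes the expansion order: Fui are unary transforms of the raw features, Foi are higher-order transforms over raw plus unary features, and Foui applies the unary transforms once more on top. A toy standalone sketch of the same scheme, with lambdas standing in for transformation objects:

import itertools

raw = ['x1', 'x2']
unary = [lambda f: 'scale({})'.format(f)]
binary = [lambda a, b: 'sum({},{})'.format(a, b)]

def apply_unary(feats):
    return [t(f) for t in unary for f in feats]

def apply_binary(feats):
    return [t(a, b) for t in binary for a, b in itertools.combinations(feats, 2)]

Fui = apply_unary(raw)          # unary over raw features
Foi = apply_binary(raw + Fui)   # binary over raw + unary
Foui = apply_unary(Foi)         # unary over the binary results
print(Fui + Foi + Foui)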
Example #7
ground_truth = [28, 48, 64, 105, 128, 153, 241, 281, 318, 336, 338, 378, 433, 442, 451, 453, 455, 472, 475, 493]


print(len(ground_truth))

mask = np.zeros(len(numeric_representations), dtype=bool)
for i in range(len(numeric_representations)):
	for g in ground_truth:
		if str(numeric_representations[i]) == 'V' + str(g):
			mask[i] = True
			break

print(np.sum(mask))

all_features = CandidateFeature(IdentityTransformation(-1), numeric_representations)
all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features])

#foreigner = np.array(X_train[:,7])
#gender = np.array(['female' in personal_status for personal_status in X_train[:,15]])

scoring = {'auc': make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)}


#for count_i in range(10):
parameter_grid = {'model__penalty': ['l2'], 'model__C': [1], 'model__solver': ['lbfgs'],
				  'model__class_weight': ['balanced'], 'model__max_iter': [10000], 'model__multi_class': ['auto']}

my_pipeline = Pipeline([('features', all_standardized.pipeline),
						#('selection', L1Selection()),
						#('selection', SelectKBest(score_func=mutual_info_classif,k=10)),
						#('selection', SelectKBest(score_func=f_classif, k=10)),
						('model', LogisticRegression())])
Example #8
    def run_pipeline(self, which_features_to_use, runs=1):
        results = {}

        start_time = time.time()

        # generate pipeline
        results['complexity'] = 0
        all_selected_features = []
        for i in range(len(which_features_to_use)):
            if which_features_to_use[i]:
                all_selected_features.append(self.numeric_representations[i])
                results['complexity'] += self.numeric_representations[
                    i].get_complexity()

        all_features = CandidateFeature(IdentityTransformation(-1),
                                        all_selected_features)
        all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                            [all_features])

        my_pipeline = Pipeline([('f', all_standardized.pipeline),
                                ('c', self.model())])

        cv_scores = []
        test_scores = []
        pred_test = None
        proba_pred_test = None

        if runs > 1:
            for r in range(runs):
                kfolds = StratifiedKFold(10, shuffle=True, random_state=42 + r)
                self.pipeline = GridSearchCV(my_pipeline,
                                             self.parameter_grid,
                                             cv=kfolds.split(
                                                 self.X_train, self.y_train),
                                             scoring=self.scoring,
                                             n_jobs=4)
                self.pipeline.fit(self.X_train, self.y_train)

                pred_test = self.pipeline.predict(self.X_test)
                proba_pred_test = self.pipeline.predict_proba(self.X_test)

                test_auc = self.auc(self.pipeline, self.X_test, self.y_test)

                cv_scores.append(self.pipeline.best_score_)
                test_scores.append(test_auc)

            std_loss = np.std(cv_scores)
            loss = np.average(cv_scores)
        else:
            kfolds = StratifiedKFold(10, shuffle=True, random_state=42)
            self.pipeline = GridSearchCV(my_pipeline,
                                         self.parameter_grid,
                                         cv=kfolds.split(
                                             self.X_train, self.y_train),
                                         scoring=self.scoring,
                                         n_jobs=1,
                                         refit='auc')
            self.pipeline.fit(self.X_train, pd.DataFrame(self.y_train))

            pred_test = self.pipeline.predict(self.X_test)
            proba_pred_test = self.pipeline.predict_proba(self.X_test)

            test_auc = make_scorer(roc_auc_score,
                                   greater_is_better=True,
                                   needs_threshold=True)(self.pipeline,
                                                         self.X_test,
                                                         self.y_test)

            for k in self.scoring.keys():
                results[k] = self.pipeline.cv_results_['mean_test_' + str(k)][
                    self.pipeline.best_index_]

            loss = self.pipeline.cv_results_['mean_test_auc'][
                self.pipeline.best_index_]
            test_scores.append(test_auc)

        results['test_auc'] = np.average(test_scores)

        results['cv_time'] = time.time() - start_time
        results['global_time'] = time.time() - self.global_starting_time

        return results  #loss, np.average(test_scores), pred_test, 0.0, proba_pred_test
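
The single-run branch relies on scikit-learn's multi-metric search: a scoring dict plus refit='auc' makes best_index_ and mean_test_auc refer to the AUC metric. A self-contained sketch of that pattern on toy data (same scorer call as above, with a plain LogisticRegression standing in for the feature pipeline):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=200, random_state=0)
scoring = {'auc': make_scorer(roc_auc_score, greater_is_better=True,
                              needs_threshold=True)}
grid = GridSearchCV(LogisticRegression(max_iter=10000), {'C': [0.1, 1.0]},
                    cv=StratifiedKFold(5, shuffle=True, random_state=42),
                    scoring=scoring, refit='auc')
grid.fit(X, y)
print(grid.cv_results_['mean_test_auc'][grid.best_index_])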
Example #9
from fastsklearnfeature.transformations.FastGroupByThenTransformation import FastGroupByThenTransformation
from fastsklearnfeature.transformations.PandasDiscretizerTransformation import PandasDiscretizerTransformation
from fastsklearnfeature.transformations.MinMaxScalingTransformation import MinMaxScalingTransformation

import numpy as np

# random integer column to exercise the transformations
data = np.random.randint(1000, size=(10000, 1))

p = PandasDiscretizerTransformation(number_bins=10)
s = MinMaxScalingTransformation()

# min-max scaling is idempotent: scaling already scaled data changes nothing
first = s.fit_transform(data)
second = s.fit_transform(first)

print(np.allclose(first, second))  # True

# discretization changes the value range, so a further scaling pass is no longer a no-op
first = s.fit_transform(data)
second = p.fit_transform(first)
third = s.fit_transform(second)

print(np.allclose(third, second))
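
The first check reproduced with scikit-learn's MinMaxScaler, so it runs without fastsklearnfeature installed:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

data = np.random.default_rng(0).integers(0, 1000, size=(10000, 1)).astype(float)
once = MinMaxScaler().fit_transform(data)
twice = MinMaxScaler().fit_transform(once)
print(np.allclose(once, twice))  # True: rescaling data already in [0, 1] is a no-op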
Example #10
    def generate_candidates(self):

        unary_transformations: List[UnaryTransformation] = []
        unary_transformations.append(
            PandasDiscretizerTransformation(number_bins=10))
        unary_transformations.append(MinMaxScalingTransformation())

        higher_order_transformations: List[Transformation] = []
        higher_order_transformations.extend(
            HigherOrderCommutativeClassGenerator(
                2, methods=[np.nansum, np.nanprod]).produce())
        higher_order_transformations.extend(
            NumpyBinaryClassGenerator(
                methods=[np.divide, np.subtract]).produce())

        #count is missing
        higher_order_transformations.extend(
            GroupByThenGenerator(
                2, methods=[np.nanmax, np.nanmin, np.nanmean,
                            np.nanstd]).produce())

        transformations = []
        transformations.extend(unary_transformations)
        transformations.extend(higher_order_transformations)
        #transformations.append(IdentityTransformation(2))

        print("unary transformations: " + str(len(unary_transformations)))
        print("higherorder transformations: " +
              str(len(higher_order_transformations)))

        features = self.Fi
        '''
        graph = nx.DiGraph()


        graph.add_node('root')
        for f in features:
            graph.add_node(str(f))
            graph.node[str(f)]['feature'] = f
            graph.add_edge('root', str(f))
        '''

        F0 = features

        F = []
        F.append(F0)
        '''
        for depth in range(2):
            F_t_plus_1 = []
            for t_i in transformations:
                for f_i in t_i.get_combinations(list(itertools.chain(*F[0:depth+1]))):
                    if t_i.is_applicable(f_i):
                        current_feature = CandidateFeature(copy.deepcopy(t_i), f_i)
                        print(current_feature)

                        
                        graph.add_node(str(current_feature))
                        graph.node[str(current_feature)]['feature'] = current_feature
                        for parent_feature in f_i:
                            graph.add_edge(str(parent_feature), str(current_feature))
                        
                        F_t_plus_1.append(current_feature)
            F.append(F_t_plus_1)

            print(len(list(itertools.chain(*F))))

        #self.plot_graph(graph)
        '''

        for depth in range(3):
            results = self.generate_in_parallel(transformations,
                                                F[0:depth + 1])
            F.append(results)

            print(len(list(itertools.chain(*F))))
Example #11
def run_pipeline(which_features_to_use, c=None, runs=1):

	model = LogisticRegression

	if c is None:
		c = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
	else:
		c = [c]

	parameter_grid = {'c__penalty': ['l2'], 'c__C': c, 'c__solver': ['lbfgs'],
					  'c__class_weight': ['balanced'], 'c__max_iter': [10000], 'c__multi_class': ['auto']}

	auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

	base = "/home/felix/phd/feature_constraints/" + str(which_experiment)

	with open(base + "/features.p", "rb") as f:
		numeric_representations = pickle.load(f)

	#print(len(numeric_representations))

	#X_train, X_test, y_train, y_test
	with open(base + "/X_train.p", "rb") as f:
		X_train = pickle.load(f)
	with open(base + "/X_test.p", "rb") as f:
		X_test = pickle.load(f)
	with open(base + "/y_train.p", "rb") as f:
		y_train = pickle.load(f)
	with open(base + "/y_test.p", "rb") as f:
		y_test = pickle.load(f)



	#generate pipeline
	all_selected_features = []
	for i in range(len(which_features_to_use)):
		if which_features_to_use[i]:
			all_selected_features.append(numeric_representations[i])

	all_features = CandidateFeature(IdentityTransformation(-1), all_selected_features)
	all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features])

	my_pipeline = Pipeline([('f', all_standardized.pipeline),
							('c', model())
							])

	cv_scores = []
	test_scores = []
	pred_test = None
	proba_pred_test = None

	if runs > 1:
		for r in range(runs):
			kfolds = StratifiedKFold(10, shuffle=True, random_state=42+r)
			pipeline = GridSearchCV(my_pipeline, parameter_grid, cv=kfolds.split(X_train, y_train), scoring=auc, n_jobs=4)
			pipeline.fit(X_train, y_train)

			pred_test = pipeline.predict(X_test)
			proba_pred_test = pipeline.predict_proba(X_test)

			test_auc = auc(pipeline, X_test, y_test)

			cv_scores.append(pipeline.best_score_)
			test_scores.append(test_auc)

		std_loss = np.std(cv_scores)
		loss = np.average(cv_scores)
	else:
		kfolds = StratifiedKFold(10, shuffle=True, random_state=42)
		pipeline = GridSearchCV(my_pipeline, parameter_grid, cv=kfolds.split(X_train, y_train), scoring=auc, n_jobs=4)
		pipeline.fit(X_train, y_train)

		pred_test = pipeline.predict(X_test)
		proba_pred_test = pipeline.predict_proba(X_test)

		test_auc = auc(pipeline, X_test, y_test)

		std_loss = pipeline.cv_results_['std_test_score'][pipeline.best_index_]
		#std_loss = np.min([pipeline.cv_results_['split' + str(split)+ '_test_score'][pipeline.best_index_] for split in range(10)])
		loss = pipeline.cv_results_['mean_test_score'][pipeline.best_index_]
		test_scores.append(test_auc)

		print(pipeline.classes_)

	return loss, np.average(test_scores), pred_test, std_loss, proba_pred_test
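
A hypothetical usage sketch for run_pipeline (the pickled files and the global which_experiment are assumed to be in place, and the mask length must match the pickled feature list):

import numpy as np

mask = np.zeros(500, dtype=bool)  # assumed length of the pickled feature list
mask[:3] = True                   # keep only the first three candidate features
loss, test_auc, pred_test, std_loss, proba_pred_test = run_pipeline(mask, c=1, runs=1)
print('cv auc:', loss, '+/-', std_loss, 'test auc:', test_auc)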