def get_transformation_for_division(train_X_all, raw_features):
    unary_transformations: List[UnaryTransformation] = []
    binary_transformations: List[Transformation] = []

    # unary_transformations.append(PandasDiscretizerTransformation(number_bins=10))
    unary_transformations.append(MinMaxScalingTransformation())
    unary_transformations.append(MDLPDiscretizerTransformation())
    unary_transformations.append(OneDivisionTransformation())
    unary_transformations.append(MinusTransformation())
    unary_transformations.append(ImputationTransformation('mean'))
    # unary_transformations.append(ImputationTransformation('median'))
    # unary_transformations.append(ImputationTransformation('most_frequent'))

    binary_transformations.extend(
        HigherOrderCommutativeClassGenerator(
            2,
            methods=[np.nansum, np.nanprod],
            sympy_methods=[sympy.Add, sympy.Mul]).produce())
    binary_transformations.extend(
        GroupByThenGenerator(
            2,
            methods=[np.nanmax, np.nanmin, np.nanmean, np.nanstd],
            sympy_methods=[groupbythenmax, groupbythenmin,
                           groupbythenmean, groupbythenstd]).produce())

    unary_transformations.extend(
        OneHotGenerator(train_X_all, raw_features).produce())

    return unary_transformations, binary_transformations
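A minimal usage sketch, assuming the imports used throughout these snippets (np, sympy, the fastsklearnfeature transformation classes) and that train_X_all and raw_features come from the dataset reader; only the sizes of the returned candidate lists are inspected here:

# Hypothetical illustration: the producer is called once and its two lists
# seed the unary and binary candidate enumeration.
unary_transformations, binary_transformations = get_transformation_for_division(
    train_X_all, raw_features)
print('unary transformations: ' + str(len(unary_transformations)))
print('binary transformations: ' + str(len(binary_transformations)))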
def fit(self, X, y=None):
    fe = ComplexityDrivenFeatureConstruction(
        None,
        reader=ScikitReader(
            X,
            y,
            feature_names=self.feature_names,
            feature_is_categorical=self.feature_is_categorical),
        score=self.scoring,
        c_max=self.c_max,
        folds=self.cv,
        max_seconds=self.max_time_secs,
        classifier=self.model.__class__,
        grid_search_parameters=self.parameter_grid,
        n_jobs=self.n_jobs,
        epsilon=self.epsilon,
        remove_parents=False,
        transformation_producer=self.transformation_producer)
    fe.run()

    # keep only representations that were scored, are not of object dtype,
    # are not plain min-max scalings, and are not mere negations
    # (a sympy.Mul whose arguments contain -1)
    numeric_representations = []
    for r in fe.all_representations:
        if 'score' in r.runtime_properties:
            if 'object' not in str(r.properties['type']):
                if not isinstance(r.transformation, MinMaxScalingTransformation):
                    if isinstance(r.sympy_representation, sympy.Mul):
                        if S.NegativeOne not in r.sympy_representation._args:
                            numeric_representations.append(r)
                    else:
                        numeric_representations.append(r)

    self.numeric_features = numeric_representations

    # persist the feature names (not the raw data) for later inspection
    feature_names = [str(ff) for ff in self.numeric_features]
    with open('/tmp/names.pickle', 'wb') as f:
        pickle.dump(feature_names, f, pickle.HIGHEST_PROTOCOL)

    all_features = CandidateFeature(IdentityTransformation(-1),
                                    numeric_representations)
    # all_imputation = CandidateFeature(ImputationTransformation(), [all_features])
    all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                        [all_features])

    self.pipeline_ = all_standardized.pipeline
    self.pipeline_.fit(X, y)
    return self
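A usage sketch, assuming the class containing this fit follows the scikit-learn estimator protocol (the wrapper name constructor is a placeholder, not the project's API):

# Hypothetical usage: after fitting, the learned feature pipeline can
# transform unseen data into the constructed representation.
constructor.fit(X_train, y_train)
X_test_constructed = constructor.pipeline_.transform(X_test)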
def run(self):
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate(42)
    self.generate_target()

    myfolds = copy.deepcopy(list(self.preprocessed_folds))

    level_scores: Dict[int, List[float]] = {}
    level_test_scores: Dict[int, List[float]] = {}

    # build the baseline representation: numeric features are used directly
    # (imputed if they have missing values); categorical features are
    # one-hot encoded
    baseline_features: List[CandidateFeature] = []
    for r in self.raw_features:
        if r.is_numeric() and not r.properties['categorical']:
            if not r.properties['missing_values']:
                baseline_features.append(r)
            else:
                baseline_features.append(
                    CandidateFeature(ImputationTransformation(), [r]))
        else:
            baseline_features.extend([
                CandidateFeature(t, [r])
                for t in OneHotGenerator(self.train_X_all, [r]).produce()
            ])

    # standardize
    scaled_baseline_features = []
    for c in baseline_features:
        scaled_baseline_features.append(
            CandidateFeature(MinMaxScalingTransformation(), [c]))

    combo = CandidateFeature(
        IdentityTransformation(len(baseline_features)),
        scaled_baseline_features)

    results = self.evaluate_candidates_detail([combo], myfolds, 1)
    print(str(results[0].runtime_properties))
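For orientation, the baseline built above corresponds roughly to the following plain scikit-learn preprocessing (a sketch only; numeric_cols and categorical_cols are assumed column-index lists, and this is not the project's actual code path):

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# numeric features: mean-impute, then scale to [0, 1];
# categorical features: one-hot encode
numeric = Pipeline([('impute', SimpleImputer(strategy='mean')),
                    ('scale', MinMaxScaler())])
baseline = ColumnTransformer([
    ('num', numeric, numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)])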
def get_transformation_for_cat_feature_space(train_X_all, raw_features):
    unary_transformations: List[UnaryTransformation] = []
    binary_transformations: List[Transformation] = []

    unary_transformations.append(
        PandasDiscretizerTransformation(number_bins=10))
    unary_transformations.append(MinMaxScalingTransformation())

    binary_transformations.extend(
        HigherOrderCommutativeClassGenerator(
            2,
            methods=[np.nansum, np.nanprod],
            sympy_methods=[sympy.Add, sympy.Mul]).produce())
    binary_transformations.extend(
        NumpyBinaryClassGenerator(
            methods=[np.divide, np.subtract],
            sympy_methods=[sympy_divide, sympy_subtract]).produce())
    binary_transformations.extend(
        GroupByThenGenerator(
            2,
            methods=[np.nanmax, np.nanmin, np.nanmean, np.nanstd],
            sympy_methods=[groupbythenmax, groupbythenmin,
                           groupbythenmean, groupbythenstd]).produce())

    unary_transformations.extend(
        OneHotGenerator(train_X_all, raw_features).produce())

    return unary_transformations, binary_transformations
def run(self):
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate(42)
    self.generate_target()

    myfolds = copy.deepcopy(list(self.preprocessed_folds))

    level_scores: Dict[int, List[float]] = {}
    level_test_scores: Dict[int, List[float]] = {}

    features = pickle.load(open('/tmp/cover_features.p', "rb"))

    # apply min-max scaling to every loaded feature
    new_features: List[CandidateFeature] = []
    for f in features:
        new_features.append(
            CandidateFeature(MinMaxScalingTransformation(), [f]))

    results = self.evaluate_candidates([
        CandidateFeature(IdentityTransformation(len(new_features)),
                         new_features)
    ], myfolds)

    print(results[0])
    print(results[0].runtime_properties)
    return results[0]
def produce_features(self):
    unary_transformations: List[UnaryTransformation] = []
    unary_transformations.append(
        PandasDiscretizerTransformation(number_bins=10))
    unary_transformations.append(MinMaxScalingTransformation())

    higher_order_transformations: List[Transformation] = []
    higher_order_transformations.extend(
        HigherOrderCommutativeClassGenerator(
            2, methods=[np.nansum, np.nanprod]).produce())
    higher_order_transformations.extend(
        NumpyBinaryClassGenerator(
            methods=[np.divide, np.subtract]).produce())
    # count is missing
    higher_order_transformations.extend(
        GroupByThenGenerator(
            2, methods=[np.nanmax, np.nanmin, np.nanmean,
                        np.nanstd]).produce())

    # Fui: unary transformations applied to the raw features
    Fui = self.generate_features1(unary_transformations, self.raw_features)

    # Foi: higher-order transformations applied to raw and unary features
    Fi_and_Fui = []
    Fi_and_Fui.extend(self.raw_features)
    Fi_and_Fui.extend(Fui)
    Foi = self.generate_features1(higher_order_transformations, Fi_and_Fui)

    # Foui: unary transformations applied to the higher-order features
    Foui = self.generate_features1(unary_transformations, Foi)

    Fi_cand = []
    Fi_cand.extend(Fui)
    Fi_cand.extend(Foi)
    Fi_cand.extend(Foui)
    return Fi_cand
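A rough upper bound on the size of this candidate space, assuming every transformation is applicable to every input combination and binary transformations take ordered pairs (a crude estimate for intuition, not the exact enumeration logic of generate_features1):

def candidate_space_upper_bound(n_raw, n_unary, n_binary):
    # Fui: every unary transformation applied to every raw feature
    fui = n_unary * n_raw
    # Foi: every binary transformation applied to ordered pairs over raw + unary features
    base = n_raw + fui
    foi = n_binary * base * (base - 1)
    # Foui: every unary transformation applied to every higher-order feature
    foui = n_unary * foi
    return fui + foi + foui

# e.g. 10 raw features, 2 unary and 8 binary transformations
print(candidate_space_upper_bound(10, 2, 8))  # grows quadratically in the feature count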
ground_truth = [28, 48, 64, 105, 128, 153, 241, 281, 318, 336, 338, 378,
                433, 442, 451, 453, 455, 472, 475, 493]
print(len(ground_truth))

# mark every representation whose name matches a ground-truth feature 'V<id>'
mask = np.zeros(len(numeric_representations), dtype=bool)
for i in range(len(numeric_representations)):
    for g in ground_truth:
        if str(numeric_representations[i]) == 'V' + str(g):
            mask[i] = True
            break
print(np.sum(mask))

all_features = CandidateFeature(IdentityTransformation(-1),
                                numeric_representations)
all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                    [all_features])

scoring = {'auc': make_scorer(roc_auc_score, greater_is_better=True,
                              needs_threshold=True)}

parameter_grid = {'model__penalty': ['l2'],
                  'model__C': [1],
                  'model__solver': ['lbfgs'],
                  'model__class_weight': ['balanced'],
                  'model__max_iter': [10000],
                  'model__multi_class': ['auto']}

my_pipeline = Pipeline([('features', all_standardized.pipeline),
                        # ('selection', L1Selection()),
                        # ('selection', SelectKBest(score_func=mutual_info_classif, k=10)),
                        # ('selection', SelectKBest(score_func=f_classif, k=10)),
                        # final step assumed: the grid keys are prefixed 'model__'
                        # and hold LogisticRegression parameters
                        ('model', LogisticRegression())])
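The snippet ends before the pipeline is evaluated; below is a sketch of how it might be run, mirroring the run_pipeline functions elsewhere in this section (X_train and y_train are assumed to exist in this script's scope):

# Grid search over the (single-point) parameter grid with the AUC scorer above.
kfolds = StratifiedKFold(10, shuffle=True, random_state=42)
search = GridSearchCV(my_pipeline, parameter_grid,
                      cv=kfolds.split(X_train, y_train),
                      scoring=scoring, refit='auc', n_jobs=4)
search.fit(X_train, y_train)
print(search.cv_results_['mean_test_auc'][search.best_index_])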
def run_pipeline(self, which_features_to_use, runs=1):
    results = {}
    start_time = time.time()

    # generate pipeline from the selected feature subset
    results['complexity'] = 0
    all_selected_features = []
    for i in range(len(which_features_to_use)):
        if which_features_to_use[i]:
            all_selected_features.append(self.numeric_representations[i])
            results['complexity'] += self.numeric_representations[i].get_complexity()

    all_features = CandidateFeature(IdentityTransformation(-1),
                                    all_selected_features)
    all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                        [all_features])

    my_pipeline = Pipeline([('f', all_standardized.pipeline),
                            ('c', self.model())])

    cv_scores = []
    test_scores = []
    pred_test = None
    proba_pred_test = None

    if runs > 1:
        for r in range(runs):
            kfolds = StratifiedKFold(10, shuffle=True, random_state=42 + r)
            self.pipeline = GridSearchCV(my_pipeline,
                                         self.parameter_grid,
                                         cv=kfolds.split(self.X_train, self.y_train),
                                         scoring=self.scoring,
                                         n_jobs=4)
            self.pipeline.fit(self.X_train, self.y_train)
            pred_test = self.pipeline.predict(self.X_test)
            proba_pred_test = self.pipeline.predict_proba(self.X_test)
            test_auc = self.auc(self.pipeline, self.X_test, self.y_test)
            cv_scores.append(self.pipeline.best_score_)
            test_scores.append(test_auc)
        std_loss = np.std(cv_scores)
        loss = np.average(cv_scores)
    else:
        kfolds = StratifiedKFold(10, shuffle=True, random_state=42)
        self.pipeline = GridSearchCV(my_pipeline,
                                     self.parameter_grid,
                                     cv=kfolds.split(self.X_train, self.y_train),
                                     scoring=self.scoring,
                                     n_jobs=1,
                                     refit='auc')
        self.pipeline.fit(self.X_train, pd.DataFrame(self.y_train))
        pred_test = self.pipeline.predict(self.X_test)
        proba_pred_test = self.pipeline.predict_proba(self.X_test)
        test_auc = make_scorer(roc_auc_score,
                               greater_is_better=True,
                               needs_threshold=True)(self.pipeline,
                                                     self.X_test,
                                                     self.y_test)

        for k in self.scoring.keys():
            results[k] = self.pipeline.cv_results_['mean_test_' + str(k)][self.pipeline.best_index_]
        loss = self.pipeline.cv_results_['mean_test_auc'][self.pipeline.best_index_]
        test_scores.append(test_auc)

    results['test_auc'] = np.average(test_scores)
    results['cv_time'] = time.time() - start_time
    results['global_time'] = time.time() - self.global_starting_time
    return results
from fastsklearnfeature.transformations.PandasDiscretizerTransformation import PandasDiscretizerTransformation
from fastsklearnfeature.transformations.MinMaxScalingTransformation import MinMaxScalingTransformation
import numpy as np

data = np.random.randint(1000, size=(10000, 1))

p = PandasDiscretizerTransformation(number_bins=10)
s = MinMaxScalingTransformation()

# idempotency check: min-max scaling data that is already in [0, 1]
# should leave it unchanged
first = s.fit_transform(data)
second = s.fit_transform(first)
print(np.allclose(first, second))

# order check: discretizing scaled data and then scaling again rescales
# the bin labels, so the result generally differs from the discretized data
first = s.fit_transform(data)
second = p.fit_transform(first)
third = s.fit_transform(second)
print(np.allclose(third, second))
def generate_candidates(self):
    unary_transformations: List[UnaryTransformation] = []
    unary_transformations.append(
        PandasDiscretizerTransformation(number_bins=10))
    unary_transformations.append(MinMaxScalingTransformation())

    higher_order_transformations: List[Transformation] = []
    higher_order_transformations.extend(
        HigherOrderCommutativeClassGenerator(
            2, methods=[np.nansum, np.nanprod]).produce())
    higher_order_transformations.extend(
        NumpyBinaryClassGenerator(
            methods=[np.divide, np.subtract]).produce())
    # count is missing
    higher_order_transformations.extend(
        GroupByThenGenerator(
            2, methods=[np.nanmax, np.nanmin, np.nanmean,
                        np.nanstd]).produce())

    transformations = []
    transformations.extend(unary_transformations)
    transformations.extend(higher_order_transformations)

    print("unary transformations: " + str(len(unary_transformations)))
    print("higher-order transformations: " +
          str(len(higher_order_transformations)))

    F0 = self.Fi
    F = []
    F.append(F0)

    # grow the candidate set level by level: at each depth, apply every
    # transformation to the features produced at all previous levels
    for depth in range(3):
        results = self.generate_in_parallel(transformations, F[0:depth + 1])
        F.append(results)
        print(len(list(itertools.chain(*F))))
def run_pipeline(which_features_to_use, c=None, runs=1):
    model = LogisticRegression

    if c is None:
        c = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    else:
        c = [c]

    parameter_grid = {'c__penalty': ['l2'],
                      'c__C': c,
                      'c__solver': ['lbfgs'],
                      'c__class_weight': ['balanced'],
                      'c__max_iter': [10000],
                      'c__multi_class': ['auto']}

    auc = make_scorer(roc_auc_score, greater_is_better=True,
                      needs_threshold=True)

    base_path = "/home/felix/phd/feature_constraints/" + str(which_experiment)
    numeric_representations = pickle.load(open(base_path + "/features.p", "rb"))
    X_train = pickle.load(open(base_path + "/X_train.p", "rb"))
    X_test = pickle.load(open(base_path + "/X_test.p", "rb"))
    y_train = pickle.load(open(base_path + "/y_train.p", "rb"))
    y_test = pickle.load(open(base_path + "/y_test.p", "rb"))

    # generate pipeline from the selected feature subset
    all_selected_features = []
    for i in range(len(which_features_to_use)):
        if which_features_to_use[i]:
            all_selected_features.append(numeric_representations[i])

    all_features = CandidateFeature(IdentityTransformation(-1),
                                    all_selected_features)
    all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                        [all_features])

    my_pipeline = Pipeline([('f', all_standardized.pipeline),
                            ('c', model())])

    cv_scores = []
    test_scores = []
    pred_test = None
    proba_pred_test = None

    if runs > 1:
        for r in range(runs):
            kfolds = StratifiedKFold(10, shuffle=True, random_state=42 + r)
            pipeline = GridSearchCV(my_pipeline, parameter_grid,
                                    cv=kfolds.split(X_train, y_train),
                                    scoring=auc, n_jobs=4)
            pipeline.fit(X_train, y_train)
            pred_test = pipeline.predict(X_test)
            proba_pred_test = pipeline.predict_proba(X_test)
            test_auc = auc(pipeline, X_test, y_test)
            cv_scores.append(pipeline.best_score_)
            test_scores.append(test_auc)
        std_loss = np.std(cv_scores)
        loss = np.average(cv_scores)
    else:
        kfolds = StratifiedKFold(10, shuffle=True, random_state=42)
        pipeline = GridSearchCV(my_pipeline, parameter_grid,
                                cv=kfolds.split(X_train, y_train),
                                scoring=auc, n_jobs=4)
        pipeline.fit(X_train, y_train)
        pred_test = pipeline.predict(X_test)
        proba_pred_test = pipeline.predict_proba(X_test)
        test_auc = auc(pipeline, X_test, y_test)
        std_loss = pipeline.cv_results_['std_test_score'][pipeline.best_index_]
        loss = pipeline.cv_results_['mean_test_score'][pipeline.best_index_]
        test_scores.append(test_auc)
        print(pipeline.classes_)

    return loss, np.average(test_scores), pred_test, std_loss, proba_pred_test
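A usage sketch for this module-level run_pipeline (assuming which_experiment is set and the pickled artifacts exist; the feature count is an assumption and must match the length of features.p):

# Hypothetical call: evaluate the full feature set with a fixed C.
n_features = 20  # assumption: number of features in the chosen experiment
loss, test_auc, pred_test, std_loss, proba_test = run_pipeline(
    [True] * n_features, c=1, runs=1)
print('cv auc: ' + str(loss) + ', test auc: ' + str(test_auc))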