Example #1
    def fit(self, X, y=None):
        fe = ComplexityDrivenFeatureConstruction(
            None,
            reader=ScikitReader(
                X,
                y,
                feature_names=self.feature_names,
                feature_is_categorical=self.feature_is_categorical),
            score=self.scoring,
            c_max=self.c_max,
            folds=self.cv,
            max_seconds=self.max_time_secs,
            classifier=self.model.__class__,
            grid_search_parameters=self.parameter_grid,
            n_jobs=self.n_jobs,
            epsilon=self.epsilon,
            remove_parents=False,
            transformation_producer=self.transformation_producer)

        fe.run()

        # Keep only scored, non-object-typed representations, skipping plain
        # min-max scalings and sympy products that are mere negations (-1 * x).
        numeric_representations = []
        for r in fe.all_representations:
            if 'score' not in r.runtime_properties:
                continue
            if 'object' in str(r.properties['type']):
                continue
            if isinstance(r.transformation, MinMaxScalingTransformation):
                continue
            #if not (isinstance(r.transformation, HigherOrderCommutativeTransformation) and r.transformation.method == np.nansum):
            if (isinstance(r.sympy_representation, sympy.Mul)
                    and S.NegativeOne in r.sympy_representation.args):
                continue
            numeric_representations.append(r)

        self.numeric_features = numeric_representations

        # Persist the string names of the selected features for later inspection.
        feature_name_strings = [str(ff) for ff in self.numeric_features]

        with open('/tmp/names.pickle', 'wb') as f:
            pickle.dump(feature_name_strings, f, pickle.HIGHEST_PROTOCOL)

        # Concatenate all selected representations and min-max scale the result.
        all_features = CandidateFeature(IdentityTransformation(-1),
                                        numeric_representations)

        #all_imputation = CandidateFeature(ImputationTransformation(), [all_features])
        all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                            [all_features])

        self.pipeline_ = all_standardized.pipeline

        self.pipeline_.fit(X, y)
        return self
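The snippet above only defines fit; a minimal sketch of a matching transform, assuming the surrounding class follows the scikit-learn transformer convention (this method is not part of the original excerpt):

    def transform(self, X):
        # Delegate to the pipeline fitted in fit(); it applies the selected
        # representations and the final min-max scaling to new data.
        return self.pipeline_.transform(X)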
Example #2
    def fit(self, features, target, sample_weight=None, groups=None):
        #self.fe = ComplexityDrivenFeatureConstruction(None, reader=ScikitReader(features, target, feature_names=self.feature_names, feature_is_categorical=self.feature_is_categorical),
        #                                              score=self.scoring, c_max=np.inf, folds=10, max_seconds=self.max_time_secs, classifier=self.model, grid_search_parameters=self.parameter_grid, n_jobs=self.n_jobs, epsilon=self.epsilon)

        self.fe = ComplexityDrivenFeatureConstruction(
            None,
            reader=ScikitReader(
                features,
                target,
                feature_names=self.feature_names,
                feature_is_categorical=self.feature_is_categorical),
            score=self.scoring,
            c_max=6,
            folds=10,
            max_seconds=self.max_time_secs,
            classifier=self.model,
            grid_search_parameters=self.parameter_grid,
            n_jobs=self.n_jobs,
            epsilon=0.0)

        self.max_feature_rep = self.fe.run()

        self.pipeline = self.generate_pipeline().fit(features, target)
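After fit, the winning representation is kept in self.max_feature_rep; a short inspection sketch (the field names come from generate_pipeline in Example #4, and clf stands in for a fitted instance):

    # 'clf' is assumed to be a fitted ComplexityDrivenFeatureConstructionScikit.
    best = clf.max_feature_rep
    print(best)                                        # symbolic form of the best feature set
    print(best.runtime_properties['hyperparameters'])  # tuned classifier parameters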
Example #3
File: bench_new.py Project: BigDaMa/DFS
y = data[class_column_name]
data_no_class = data[data.columns.difference([class_column_name])]

X = data_no_class.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42, stratify=y)

parameter_grid = {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'solver': ['lbfgs'],
				  'class_weight': ['balanced'], 'max_iter': [10000], 'multi_class': ['auto']}

auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

fe = ComplexityDrivenFeatureConstruction(None, reader=ScikitReader(X_train, y_train, feature_names=['V' + str(i) for i in range(X_train.shape[1])]),
                                                      score=auc, c_max=1, folds=2,
                                                      classifier=LogisticRegression,
                                                      grid_search_parameters=parameter_grid, n_jobs=4,
                                                      epsilon=0.0)

fe.run()



numeric_representations = []

feature_names = []
for r in fe.all_representations:
    if ('score' in r.runtime_properties
            and 'object' not in str(r.properties['type'])
            and not isinstance(r.transformation, MinusTransformation)):
        print(str(r) + ':' + str(r.properties['type']) + ' : ' + str(r.runtime_properties['score']))
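The loop above only prints the scored representations; a small sketch for ranking them by cross-validated score, reading the same runtime_properties['score'] field:

    # Sort scored representations, best first.
    scored = [r for r in fe.all_representations if 'score' in r.runtime_properties]
    scored.sort(key=lambda r: r.runtime_properties['score'], reverse=True)
    for r in scored[:10]:
        print(str(r) + ' : ' + str(r.runtime_properties['score']))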
Example #4
class ComplexityDrivenFeatureConstructionScikit:
    def __init__(
            self,
            max_time_secs=None,
            scoring=make_scorer(f1_score, average='micro'),
            model=LogisticRegression,
            parameter_grid={
                'penalty': ['l2'],
                'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'solver': ['lbfgs'],
                'class_weight': ['balanced'],
                'max_iter': [10000],
                'multi_class': ['auto']
            },
            n_jobs=None,
            epsilon=-np.inf,
            feature_names=None,
            feature_is_categorical=None):
        self.fe = None
        self.max_feature_rep: CandidateFeature = None
        self.pipeline = None
        self.max_time_secs = max_time_secs
        self.scoring = scoring
        self.model = model
        self.parameter_grid = parameter_grid
        self.n_jobs = n_jobs
        self.epsilon = epsilon
        self.feature_names = feature_names
        self.feature_is_categorical = feature_is_categorical

    def fit(self, features, target, sample_weight=None, groups=None):
        #self.fe = ComplexityDrivenFeatureConstruction(None, reader=ScikitReader(features, target, feature_names=self.feature_names, feature_is_categorical=self.feature_is_categorical),
        #                                              score=self.scoring, c_max=np.inf, folds=10, max_seconds=self.max_time_secs, classifier=self.model, grid_search_parameters=self.parameter_grid, n_jobs=self.n_jobs, epsilon=self.epsilon)

        self.fe = ComplexityDrivenFeatureConstruction(
            None,
            reader=ScikitReader(
                features,
                target,
                feature_names=self.feature_names,
                feature_is_categorical=self.feature_is_categorical),
            score=self.scoring,
            c_max=6,
            folds=10,
            max_seconds=self.max_time_secs,
            classifier=self.model,
            grid_search_parameters=self.parameter_grid,
            n_jobs=self.n_jobs,
            epsilon=0.0)

        self.max_feature_rep = self.fe.run()

        self.pipeline = self.generate_pipeline().fit(features, target)

    def generate_pipeline(self):
        best_hyperparameters = self.max_feature_rep.runtime_properties[
            'hyperparameters']

        # Strip the 'classifier__' pipeline prefix so the tuned values can be
        # passed directly to the classifier's constructor.
        prefix = 'classifier__'
        for k in list(best_hyperparameters.keys()):
            if prefix in k:
                best_hyperparameters[k[len(prefix):]] = best_hyperparameters.pop(k)
        '''
        my_pipeline = ImbalancePipeline([('f', PipelineTransformation(self.max_feature_rep.pipeline)),
                                ('smote', SMOTE()),
                                ('c', self.fe.classifier(**best_hyperparameters))
                                ])
        '''
        my_pipeline = Pipeline([('f', self.max_feature_rep.pipeline),
                                ('c',
                                 self.fe.classifier(**best_hyperparameters))])

        return my_pipeline

    def predict(self, features):
        return self.pipeline.predict(features)

    def predict_proba(self, features):
        return self.pipeline.predict_proba(features)
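A short end-to-end usage sketch for the class above; loading a toy dataset this way is an illustration and not part of the original example:

    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42, stratify=y)

    clf = ComplexityDrivenFeatureConstructionScikit(max_time_secs=120, n_jobs=4)
    clf.fit(X_train, y_train)        # constructs features, then fits the pipeline
    print(clf.predict(X_test)[:10])  # delegates to the fitted scikit-learn Pipeline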
Example #5
y = data[class_column_name]
data_no_class = data[data.columns.difference([class_column_name])]

X = data_no_class.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42, stratify=y)

model = LogisticRegression
parameter_grid = {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'solver': ['lbfgs'],
				  'class_weight': ['balanced'], 'max_iter': [10000], 'multi_class': ['auto']}

auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

fe = ComplexityDrivenFeatureConstruction(None,
                                         reader=ScikitReader(X_train, y_train,
                                                             feature_names=data_no_class.columns),
                                         score=auc, c_max=2, folds=10,
                                         classifier=LogisticRegression,
                                         grid_search_parameters=parameter_grid, n_jobs=4,
                                         epsilon=0.0)

fe.run()


numeric_representations = []

feature_names = []
for r in fe.all_representations:
    if ('score' in r.runtime_properties
            and 'object' not in str(r.properties['type'])
            and not isinstance(r.transformation, MinusTransformation)):
        print(str(r) + ':' + str(r.properties['type']) + ' : ' + str(r.runtime_properties['score']))
        numeric_representations.append(r)
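The collected numeric_representations can then be merged into a single scaled feature pipeline, mirroring the IdentityTransformation / MinMaxScalingTransformation pattern from Example #1 (a sketch, not part of this example):

    # Concatenate all selected representations and min-max scale the result.
    all_features = CandidateFeature(IdentityTransformation(-1), numeric_representations)
    all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features])

    feature_pipeline = all_standardized.pipeline.fit(X_train, y_train)
    X_train_new = feature_pipeline.transform(X_train)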