def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        #working_features = self.filter_failing_in_parallel()
        #all_f = CandidateFeature(IdentityTransformation(len(working_features)), working_features)

        all_f = CandidateFeature(
            IdentityTransformation(len(self.raw_features)), self.raw_features)

        my_list = []

        for i in range(1, len(self.raw_features) + 1):
            my_list.append(
                CandidateFeature(BorutaTransformer(len(self.raw_features), i),
                                 [all_f]))

        #my_list.append(CandidateFeature(SissoTransformer(len(self.raw_features)), [all_f]))

        results = self.evaluate_candidates(my_list)

        print(results)

        for i, result in enumerate(results, start=1):
            print("(" + str(i) + "," + str(result['score']) + ")")

        new_scores = [r['score'] for r in results]
        best_id = np.argmax(new_scores)

        print(results[best_id])
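
The (k, score) lines printed above form coordinate pairs ready for plotting. A minimal matplotlib sketch over the same `results` list:

import matplotlib.pyplot as plt

scores = [r['score'] for r in results]
plt.plot(range(1, len(scores) + 1), scores, marker='o')
plt.xlabel('number of Boruta-selected features (k)')
plt.ylabel('cross-validation score')
plt.show()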
Example #2
    def run(self):

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        self.global_starting_time = time.time()

        for k in range(1, len(self.raw_features)+1):


            all_f = CandidateFeature(IdentityTransformation(len(self.raw_features)), self.raw_features)


            t = CandidateFeature(SelectKBestTransformer(len(self.raw_features), k), [all_f])

            t.pipeline.fit(self.dataset.splitted_values['train'], self.current_target)
            X = t.transform(self.dataset.splitted_values['train'])
            X_test = t.transform(self.dataset.splitted_values['test'])

            print("time: " + str(time.time() - self.global_starting_time))

            clf = GridSearchCV(self.classifier(), self.grid_search_parameters, cv=self.preprocessed_folds, scoring=self.score, iid=False,
                               error_score='raise')
            clf.fit(X, self.current_target)

            print('test score: ' + str(clf.score(X_test, self.test_target)))
            print("\n\n")
Example #3
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate(42)
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        level_scores: Dict[int, List[float]] = {}
        level_test_scores: Dict[int, List[float]] = {}

        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/eucalyptus')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/contraceptive')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/diabetes')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/credit')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/heart_new_all')
        #string2candidate = self.load_data_all('/tmp')

        baseline_features: List[CandidateFeature] = []
        for r in self.raw_features:
            if r.is_numeric() and not r.properties.get('categorical', False):
                if not r.properties['missing_values']:
                    baseline_features.append(r)
                else:
                    baseline_features.append(
                        CandidateFeature(ImputationTransformation(), [r]))
            else:
                baseline_features.extend([
                    CandidateFeature(t, [r])
                    for t in OneHotGenerator(self.train_X_all, [r]).produce()
                ])

        #baseline_features.extend(self.get_interesting_features('/home/felix/phd/fastfeatures/results/heart_small', 24))
        #baseline_features.extend(self.get_interesting_features('/home/felix/phd/fastfeatures/results/heart_new_all', 10))
        #baseline_features.extend(self.get_interesting_features(string2candidate, 2))
        '''
        for c in baseline_features:
            if isinstance(c, RawFeature):
                print(str(c) + " complexity: " + str(c.get_complexity()))
            else:
                print('nr: ' + str(c) + " complexity: " + str(c.get_complexity()))
        '''

        # standardize
        scaled_baseline_features = []
        for c in baseline_features:
            scaled_baseline_features.append(
                CandidateFeature(MinMaxScalingTransformation(), [c]))

        #scaled_baseline_features = baseline_features

        combo = CandidateFeature(
            IdentityTransformation(len(scaled_baseline_features)),
            scaled_baseline_features)

        results = self.evaluate_candidates_detail([combo], myfolds, 1)

        print(str(results[0].runtime_properties))
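
The baseline construction above (impute numeric columns that have missing values, one-hot encode categoricals, then min-max scale) can be mirrored with plain scikit-learn; a sketch, with `numeric_cols` and `categorical_cols` as hypothetical column index lists:

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

numeric_cols, categorical_cols = [0, 2], [1]  # hypothetical column indices

numeric_pipe = Pipeline([('impute', SimpleImputer(strategy='mean')),
                         ('scale', MinMaxScaler())])
baseline = ColumnTransformer([
    ('num', numeric_pipe, numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),  # 0/1 output needs no scaling
])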
Example #4
    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        working_features = self.filter_failing_features()

        all_f = CandidateFeature(IdentityTransformation(len(working_features)),
                                 working_features)

        selection = CandidateFeature(
            FeatureSelectionTransformation(
                1, 2,
                LogisticRegression(penalty='l2',
                                   solver='lbfgs',
                                   class_weight='balanced',
                                   max_iter=10000)), [all_f])

        results = self.evaluate_candidates([selection])

        new_scores = [r['score'] for r in results]
        best_id = np.argmax(new_scores)

        print(results[best_id])
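
FeatureSelectionTransformation is not defined in this snippet; its `1, 2` arguments presumably bound the number of selected features. A comparable construction with scikit-learn's SelectFromModel and the same regularized logistic regression, as a sketch on synthetic data:

import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

X = np.random.rand(100, 5)          # synthetic stand-in data
y = (X[:, 0] > 0.5).astype(int)

estimator = LogisticRegression(penalty='l2', solver='lbfgs',
                               class_weight='balanced', max_iter=10000)
# threshold=-np.inf makes max_features the only selection criterion
selector = SelectFromModel(estimator, max_features=2, threshold=-np.inf)
X_selected = selector.fit_transform(X, y)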
Example #5
    def sisso_transfusion_features_new3(
            self, name2feature) -> List[CandidateFeature]:
        sisso_features = []
        sisso_features.extend(self.raw_features)

        squared_recency = CandidateFeature(
            HigherOrderCommutativeTransformation(np.prod, 2),
            [name2feature['Recency'], name2feature['Recency']])

        squared_monetary = CandidateFeature(
            HigherOrderCommutativeTransformation(np.prod, 2),
            [name2feature['Monetary'], name2feature['Monetary']])

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                             [name2feature['Recency'], name2feature['Time']]))

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                             [name2feature['Monetary'], name2feature['Time']]))

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                             [squared_monetary, name2feature['Time']]))

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                             [squared_recency, name2feature['Time']]))

        all_f = CandidateFeature(IdentityTransformation(len(sisso_features)),
                                 sisso_features)
        return [all_f]
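
On raw arrays, the constructed SISSO-style ratios are plain column arithmetic; a numpy sketch with synthetic Recency/Monetary/Time columns:

import numpy as np

recency, monetary, time_ = np.random.rand(3, 100) + 0.1  # synthetic, bounded away from zero

recency_over_time = recency / time_
monetary_over_time = monetary / time_
squared_monetary_over_time = (monetary * monetary) / time_
squared_recency_over_time = (recency * recency) / time_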
Example #6
    def generate_merge_for_combination(self, all_evaluated_features, a: List[CandidateFeature], b: List[CandidateFeature]) -> List[CandidateFeature]:
        cat_candidates_to_be_applied = []
        id_t = IdentityTransformation(None)
        for a_i in range(len(a)):
            for b_i in range(len(b)):
                combo = [a[a_i], b[b_i]]
                if id_t.is_applicable(combo):

                    sympy_representation = id_t.get_sympy_representation([p.get_sympy_representation() for p in combo])
                    if sympy_representation not in all_evaluated_features:
                        cat_candidate = CandidateFeature(copy.deepcopy(id_t), combo)
                        cat_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                        all_evaluated_features.add(sympy_representation)
                        cat_candidates_to_be_applied.append(cat_candidate)

        return cat_candidates_to_be_applied
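
The deduplication relies on sympy canonicalizing algebraically equivalent expressions, so e.g. x*y and y*x hash identically in the `all_evaluated_features` set. A small standalone illustration of that mechanism:

import sympy

x, y = sympy.symbols('X0 X1')
seen = set()
for expr in [x * y, y * x, x + y, y + x]:
    if expr not in seen:   # commutative variants collapse to one canonical form
        seen.add(expr)
print(len(seen))  # 2: one product, one sum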
Example #7
    def fit(self, X, y=None):
        fe = ComplexityDrivenFeatureConstruction(
            None,
            reader=ScikitReader(
                X,
                y,
                feature_names=self.feature_names,
                feature_is_categorical=self.feature_is_categorical),
            score=self.scoring,
            c_max=self.c_max,
            folds=self.cv,
            max_seconds=self.max_time_secs,
            classifier=self.model.__class__,
            grid_search_parameters=self.parameter_grid,
            n_jobs=self.n_jobs,
            epsilon=self.epsilon,
            remove_parents=False,
            transformation_producer=self.transformation_producer)

        fe.run()

        numeric_representations = []
        for r in fe.all_representations:
            if 'score' not in r.runtime_properties:
                continue
            if 'object' in str(r.properties['type']):
                continue
            if isinstance(r.transformation, MinMaxScalingTransformation):
                continue
            #if not (isinstance(r.transformation, HigherOrderCommutativeTransformation) and r.transformation.method == np.nansum):
            if isinstance(r.sympy_representation, sympy.Mul):
                # skip products that carry a factor of -1
                if any(e == S.NegativeOne for e in r.sympy_representation.args):
                    continue
            numeric_representations.append(r)

        self.numeric_features = numeric_representations

        feature_names = [str(ff) for ff in self.numeric_features]

        with open('/tmp/names.pickle', 'wb') as f:
            pickle.dump(feature_names, f, pickle.HIGHEST_PROTOCOL)

        all_features = CandidateFeature(IdentityTransformation(-1),
                                        numeric_representations)

        #all_imputation = CandidateFeature(ImputationTransformation(), [all_features])
        all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                            [all_features])

        #all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features])

        self.pipeline_ = all_standardized.pipeline

        self.pipeline_.fit(X, y)
        return self
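
Assuming the surrounding class follows the scikit-learn estimator protocol (suggested by the `fit`/`pipeline_` naming), usage would look roughly like this hypothetical sketch:

# hypothetical usage; `transformer` is an instance of the surrounding class
transformer.fit(X_train, y_train)
X_train_new = transformer.pipeline_.transform(X_train)
X_test_new = transformer.pipeline_.transform(X_test)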
Example #8
    def get_info_gain_of_feature(self, candidate: CandidateFeature):
        try:
            new_candidate = CandidateFeature(IdentityTransformation(2),
                                             [self.base_features, candidate])
            X = new_candidate.pipeline.fit_transform(
                self.dataset.splitted_values['train'], self.train_y_all_target)
            return mutual_info_classif(X, self.train_y_all_target)[-1]
        except Exception:
            return 0.0
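
The information-gain estimate is scikit-learn's mutual_info_classif evaluated on the last column of the concatenated matrix, i.e. the candidate itself. A self-contained sketch of that call on synthetic data:

import numpy as np
from sklearn.feature_selection import mutual_info_classif

X = np.random.rand(200, 5)   # base features plus the candidate in the last column
y = (X[:, -1] > 0.5).astype(int)
info_gain_of_candidate = mutual_info_classif(X, y)[-1]
print(info_gain_of_candidate)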
Example #9
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate(42)
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        baseline_features: List[CandidateFeature] = []
        for r in self.raw_features:
            if r.is_numeric() and (not 'categorical' in r.properties
                                   or not r.properties['categorical']):
                if not r.properties['missing_values']:
                    baseline_features.append(r)
                else:
                    baseline_features.append(
                        CandidateFeature(ImputationTransformation(), [r]))
            else:
                baseline_features.extend([
                    CandidateFeature(t, [r])
                    for t in OneHotGenerator(self.train_X_all, [r]).produce()
                ])

        #scale everything
        for bf_i in range(len(baseline_features)):
            baseline_features[bf_i] = CandidateFeature(
                StandardScalingTransformation(), [baseline_features[bf_i]])

        print(len(baseline_features))

        combo = CandidateFeature(
            IdentityTransformation(len(baseline_features)), baseline_features)
        '''
        categorical_ids = []
        for r in self.raw_features:
            if 'categorical' in r.properties and r.properties['categorical']:
                categorical_ids.append(r.column_id)

        combo = CandidateFeature(IdentityTransformation(0), self.raw_features)
        if len(categorical_ids) >= 1:
            combo.pipeline = Pipeline(steps=[('imputation', SimpleImputer(strategy='mean')),
                                         ('onehot', OneHotEncoder(categorical_features=categorical_ids)), ('scaling', StandardScaler(with_mean=False))])
        else:
            combo.pipeline = Pipeline(steps=[('imputation', SimpleImputer(strategy='mean')), ('scaling', StandardScaler(with_mean=False))])
        '''

        results = self.evaluate_candidates([combo], myfolds)

        #print(results[0].runtime_properties)

        #candidate2openml(results[0], self.classifier, self.reader.task, 'RawFeatureBaseline')

        return results[0]
Example #10
    def sisso_transfusion_features_new(self, name2feature):
        sisso_features = []
        sisso_features.extend(self.raw_features)

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(
                np.divide), [name2feature['Frequency'], name2feature['Time']]))

        all_f = CandidateFeature(IdentityTransformation(len(sisso_features)),
                                 sisso_features)
        return [all_f]
    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        print([r.name for r in self.dataset.raw_features])


        plain_attributes = CandidateFeature(IdentityTransformation(len(self.dataset.raw_features)), self.dataset.raw_features)


        self.evaluate_candidates([plain_attributes])
Example #12
    def explorekit_heart_features(self, name2feature):
        explore_kit_features = []
        explore_kit_features.extend(self.raw_features)

        # Discretize({Mean(age) GROUP BY Discretize(sex), Discretize(exercise_induced_angina)})
        discr_sex = CandidateFeature(PandasDiscretizerTransformation(10),
                                     [name2feature['sex']])
        discr_angina = CandidateFeature(
            PandasDiscretizerTransformation(10),
            [name2feature['exercise_induced_angina']])
        grouped = CandidateFeature(GroupByThenTransformation(
            np.mean, 3), [name2feature['age'], discr_sex, discr_angina])
        final = CandidateFeature(PandasDiscretizerTransformation(10),
                                 [grouped])

        explore_kit_features.append(final)

        all_f = CandidateFeature(
            IdentityTransformation(len(explore_kit_features)),
            explore_kit_features)
        return [all_f]
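
The comment spells the ExploreKit feature out as Discretize({Mean(age) GROUP BY Discretize(sex), Discretize(exercise_induced_angina)}); the same construction in plain pandas, as a sketch over a synthetic frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({'age': np.random.randint(30, 80, 200),
                   'sex': np.random.randint(0, 2, 200),
                   'exercise_induced_angina': np.random.randint(0, 2, 200)})

bins = 10
sex_bin = pd.cut(df['sex'], bins, labels=False)
angina_bin = pd.cut(df['exercise_induced_angina'], bins, labels=False)
grouped_mean = df.groupby([sex_bin, angina_bin])['age'].transform('mean')
feature = pd.cut(grouped_mean, bins, labels=False)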
Example #13
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate(42)
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        level_scores: Dict[int, List[float]] = {}
        level_test_scores: Dict[int, List[float]] = {}

        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/eucalyptus')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/contraceptive')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/diabetes')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/credit')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/heart_new_all')
        #string2candidate = self.load_data_all('/tmp')

        features = pickle.load(open('/tmp/cover_features.p', "rb"))

        #apply minmax scaling
        new_features: List[CandidateFeature] = []
        for f in features:
            new_features.append(
                CandidateFeature(MinMaxScalingTransformation(), [f]))

        results = self.evaluate_candidates([
            CandidateFeature(IdentityTransformation(len(new_features)),
                             new_features)
        ], myfolds)

        print(results[0])
        print(results[0].runtime_properties)

        return results[0]
Example #14
File: Sisso.py Project: BigDaMa/DFS
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        all_f = CandidateFeature(
            IdentityTransformation(len(self.raw_features)), self.raw_features)

        feature_names = [str(r) for r in self.raw_features]

        t = CandidateFeature(
            SissoTransformer(len(self.raw_features), feature_names,
                             ["^2", "^3", "1/"]), [all_f])

        t.pipeline.fit(self.dataset.splitted_values['train'],
                       self.train_y_all_target)
        X = t.transform(self.dataset.splitted_values['train'])
        X_test = t.transform(self.dataset.splitted_values['test'])

        print("time: " + str(time.time() - self.global_starting_time))

        clf = GridSearchCV(self.classifier(),
                           self.grid_search_parameters,
                           cv=self.preprocessed_folds,
                           scoring=self.score,
                           iid=False,
                           error_score='raise')
        clf.fit(X, self.train_y_all_target)

        print(X_test)

        print('test score: ' + str(clf.score(X_test, self.test_target)))
        print("\n\n")
Example #15
my_names: List[CandidateFeature] = pickle.load(
    open(
        "/home/felix/phd/feature_constraints/" + str(which_experiment) +
        "/names.p", "rb"))
print(my_names)

X_train = pickle.load(
    open(
        "/home/felix/phd/feature_constraints/" + str(which_experiment) +
        "/X_train.p", "rb"))
y_train = pickle.load(
    open(
        "/home/felix/phd/feature_constraints/" + str(which_experiment) +
        "/y_train.p", "rb"))

all_features = CandidateFeature(IdentityTransformation(-1),
                                numeric_representations)
all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                    [all_features])

foreigner = np.array(X_train[:, 7])
gender = np.array(
    ['female' in personal_status for personal_status in X_train[:, 15]])

my_runner = Runner(c=1.0, sensitive=gender, labels=['bad', 'good'])
#my_runner = Runner(c=1.0, sensitive=foreigner, labels=['bad', 'good'])

model = xgb.XGBClassifier(objective="binary:logistic",
                          n_estimators=1000,
                          random_state=42)
Example #16
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer()



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        complexity_delta = 1.0

        epsilon = self.epsilon
        limit_runs = self.c_max + 1  # 5
        unique_raw_combinations = False


        baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -2

        self.name_to_transfomed = {}

        for c in range(1, limit_runs):
            current_layer: List[CandidateFeature] = []

            #0th
            if c == 1:
                cost_2_raw_features[c]: List[CandidateFeature] = []
                for raw_f in self.raw_features:
                    if raw_f.is_numeric():
                        current_layer.append(raw_f)
                    else:
                        raw_f.runtime_properties['score'] = 0.0
                        cost_2_raw_features[c].append(raw_f)

            # first unary
            # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?)
            unary_candidates_to_be_applied: List[CandidateFeature] = []
            if (c - 1) in cost_2_raw_features:
                unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
            if (c - 1) in cost_2_unary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
            if (c - 1) in cost_2_binary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])


            current_layer.extend(self.generate_features(unary_transformations, unary_candidates_to_be_applied))

            #second binary
            #get length 2 partitions for current cost
            partition = self.get_length_2_partition(c-1)
            #print("bin: c: " + str(c) + " partition" + str(partition))

            #apply cross product from partitions
            binary_candidates_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                    for combo in list_of_combinations:
                        if bt.is_applicable(combo):
                            binary_candidates_to_be_applied.append(CandidateFeature(copy.deepcopy(bt), combo))
            current_layer.extend(binary_candidates_to_be_applied)

            #third: feature combinations
            #first variant: treat combination as a transformation
            #therefore, we can use the same partition as for binary data
            partition = self.get_length_2_partition(c)
            #print("combo c: " + str(c) + " partition" + str(partition))

            combinations_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])
                    if p[element] in cost_2_combination:
                        lists_for_each_element[element].extend(cost_2_combination[p[element]])


                list_of_combinations = self.generate_merge_for_combination(lists_for_each_element[0], lists_for_each_element[1])
                for combo in list_of_combinations:
                    if IdentityTransformation(None).is_applicable(list(combo)):
                        combinations_to_be_applied.append(CandidateFeature(IdentityTransformation(None), list(combo)))
            current_layer.extend(combinations_to_be_applied)



            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print("From " + str(length) + " combinations, we filter " +  str(length - len(current_layer)) + " nonunique raw feature combinations.")



            #now evaluate all from this layer
            #print(current_layer)
            print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
            results = self.evaluate_candidates(current_layer)
            print("----------- Evaluation Finished -----------")

            layer_end_time = time.time() - self.global_starting_time

            #calculate whether we drop the evaluated candidate
            for result in results:
                candidate: CandidateFeature = result['candidate']
                candidate.runtime_properties['score'] = result['score']
                candidate.runtime_properties['test_score'] = result['test_score']
                candidate.runtime_properties['execution_time'] = result['execution_time']
                candidate.runtime_properties['global_time'] = result['global_time']
                candidate.runtime_properties['hyperparameters'] = result['hyperparameters']
                candidate.runtime_properties['layer_end_time'] = layer_end_time

                #print(str(candidate) + " -> " + str(candidate.score))

                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate

                #calculate original score
                original_score = baseline_score #or zero??
                if not isinstance(candidate, RawFeature):
                    original_score = max([p.runtime_properties['score'] for p in candidate.parents])

                accuracy_delta = result['score'] - original_score

                if accuracy_delta / complexity_delta > epsilon:
                    if isinstance(candidate, RawFeature):
                        if c not in cost_2_raw_features:
                            cost_2_raw_features[c]: List[CandidateFeature] = []
                        cost_2_raw_features[c].append(candidate)
                    elif isinstance(candidate.transformation, UnaryTransformation):
                        if c not in cost_2_unary_transformed:
                            cost_2_unary_transformed[c]: List[CandidateFeature] = []
                        cost_2_unary_transformed[c].append(candidate)
                    elif isinstance(candidate.transformation, IdentityTransformation):
                        if c not in cost_2_combination:
                            cost_2_combination[c]: List[CandidateFeature] = []
                        cost_2_combination[c].append(candidate)
                    else:
                        if c not in cost_2_binary_transformed:
                            cost_2_binary_transformed[c]: List[CandidateFeature] = []
                        cost_2_binary_transformed[c].append(candidate)
                else:
                    if c not in cost_2_dropped_evaluated_candidates:
                        cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                    cost_2_dropped_evaluated_candidates[c].append(candidate)
            


            if c in cost_2_dropped_evaluated_candidates:
                print("Of " + str(len(current_layer)) + " candidate representations, " + str(len(cost_2_dropped_evaluated_candidates[c])) + " did not satisfy the epsilon threshold.")
            else:
                print("Of " + str(len(current_layer)) + " candidate representations, all satisfied the epsilon threshold.")


            print("Best representation found for complexity = " + str(c) + ": " + str(max_feature) + "\n")

            if self.save_logs:
                pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"))
                pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"))
                pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"))
                pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"))
                pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"))
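
The acceptance rule used throughout the layer loop keeps a candidate only when its accuracy gain per unit of added complexity beats epsilon; isolated from the loop, the test is just:

def passes_epsilon(score, parent_score, epsilon, complexity_delta=1.0):
    # keep the candidate if the per-complexity accuracy gain exceeds epsilon
    accuracy_delta = score - parent_score
    return accuracy_delta / complexity_delta > epsilon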
Example #17
    def run_pipeline(self, which_features_to_use, runs=1):
        results = {}

        start_time = time.time()

        # generate pipeline
        results['complexity'] = 0
        all_selected_features = []
        for i in range(len(which_features_to_use)):
            if which_features_to_use[i]:
                all_selected_features.append(self.numeric_representations[i])
                results['complexity'] += self.numeric_representations[
                    i].get_complexity()

        all_features = CandidateFeature(IdentityTransformation(-1),
                                        all_selected_features)
        all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                            [all_features])

        my_pipeline = Pipeline([('f', all_standardized.pipeline),
                                ('c', self.model())])

        cv_scores = []
        test_scores = []
        pred_test = None
        proba_pred_test = None

        if runs > 1:
            for r in range(runs):
                kfolds = StratifiedKFold(10, shuffle=True, random_state=42 + r)
                self.pipeline = GridSearchCV(my_pipeline,
                                             self.parameter_grid,
                                             cv=kfolds.split(
                                                 self.X_train, self.y_train),
                                             scoring=self.scoring,
                                             n_jobs=4)
                self.pipeline.fit(self.X_train, self.y_train)

                pred_test = self.pipeline.predict(self.X_test)
                proba_pred_test = self.pipeline.predict_proba(self.X_test)

                test_auc = self.auc(self.pipeline, self.X_test, self.y_test)

                cv_scores.append(self.pipeline.best_score_)
                test_scores.append(test_auc)

            std_loss = np.std(cv_scores)
            loss = np.average(cv_scores)
        else:
            kfolds = StratifiedKFold(10, shuffle=True, random_state=42)
            self.pipeline = GridSearchCV(my_pipeline,
                                         self.parameter_grid,
                                         cv=kfolds.split(
                                             self.X_train, self.y_train),
                                         scoring=self.scoring,
                                         n_jobs=1,
                                         refit='auc')
            self.pipeline.fit(self.X_train, pd.DataFrame(self.y_train))

            pred_test = self.pipeline.predict(self.X_test)
            proba_pred_test = self.pipeline.predict_proba(self.X_test)

            test_auc = make_scorer(roc_auc_score,
                                   greater_is_better=True,
                                   needs_threshold=True)(self.pipeline,
                                                         self.X_test,
                                                         self.y_test)

            for k in self.scoring.keys():
                results[k] = self.pipeline.cv_results_['mean_test_' + str(k)][
                    self.pipeline.best_index_]

            loss = self.pipeline.cv_results_['mean_test_auc'][
                self.pipeline.best_index_]
            test_scores.append(test_auc)

        results['test_auc'] = np.average(test_scores)

        results['cv_time'] = time.time() - start_time
        results['global_time'] = time.time() - self.global_starting_time

        return results  #loss, np.average(test_scores), pred_test, 0.0, proba_pred_test
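
run_pipeline expects a boolean mask over self.numeric_representations; a hypothetical call (the `wrapper` instance name is assumed) evaluating the first three representations:

# hypothetical usage
mask = [i < 3 for i in range(len(wrapper.numeric_representations))]
res = wrapper.run_pipeline(mask, runs=1)
print(res['test_auc'], res['complexity'])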
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        if self.save_logs:
            cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        self.complexity_delta = 1.0

        unique_raw_combinations = False


        baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        all_evaluated_features = set()

        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
        my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)

        my_globale_module.materialized_set = set()
        my_globale_module.predictions_set = set()

        number_of_multiple_cvs = 10
        nested_my_globale_module.splitting_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs)
        nested_my_globale_module.model_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs)

        #pickle.dump(my_globale_module.target_test_folds_global, open('/tmp/test_groundtruth.p', 'wb+'))


        c = 1
        while True:
            current_layer: List[CandidateFeature] = []

            if c <= self.max_feature_depth:
                #0th
                if c == 1:
                    cost_2_raw_features[c]: List[CandidateFeature] = []
                    #print(self.raw_features)
                    for raw_f in self.raw_features:
                        sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                        raw_f.sympy_representation = sympy_representation
                        all_evaluated_features.add(sympy_representation)
                        if raw_f.is_numeric():
                            if raw_f.properties['missing_values']:
                                raw_f.runtime_properties['score'] = 0.0
                                cost_2_raw_features[c].append(raw_f)
                            else:
                                current_layer.append(raw_f)
                            #print("numeric: " + str(raw_f))
                        else:
                            raw_f.runtime_properties['score'] = 0.0
                            cost_2_raw_features[c].append(raw_f)
                            #print("nonnumeric: " + str(raw_f))

                        self.materialize_raw_features(raw_f)
                        #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

                # first unary
                # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?)
                unary_candidates_to_be_applied: List[CandidateFeature] = []
                if (c - 1) in cost_2_raw_features:
                    unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
                if (c - 1) in cost_2_unary_transformed:
                    unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
                if (c - 1) in cost_2_binary_transformed:
                    unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])

                all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
                current_layer.extend(all_unary_features)

                #second binary
                #get length 2 partitions for current cost
                partition = self.get_length_2_partition(c-1)
                #print("bin: c: " + str(c) + " partition" + str(partition))

                #apply cross product from partitions
                binary_candidates_to_be_applied: List[CandidateFeature] = []
                for p in partition:
                    lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                    for element in range(2):
                        if p[element] in cost_2_raw_features:
                            lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                        if p[element] in cost_2_unary_transformed:
                            lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                        if p[element] in cost_2_binary_transformed:
                            lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

                    for bt in binary_transformations:
                        list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                        #print(list_of_combinations)
                        for combo in list_of_combinations:
                            if bt.is_applicable(combo):
                                sympy_representation = bt.get_sympy_representation(
                                    [p.get_sympy_representation() for p in combo])
                                try:
                                    if len(sympy_representation.free_symbols) > 0:  # if expression is not constant
                                        if not sympy_representation in all_evaluated_features:
                                            bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                            bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                            all_evaluated_features.add(sympy_representation)
                                            binary_candidates_to_be_applied.append(bin_candidate)
                                        else:
                                            #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                            pass
                                    else:
                                        #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                        pass
                                except Exception:
                                    pass
                current_layer.extend(binary_candidates_to_be_applied)

            #third: feature combinations
            #first variant: treat combination as a transformation
            #therefore, we can use the same partition as for binary data
            partition = self.get_length_2_partition(c)
            #print("combo c: " + str(c) + " partition" + str(partition))


            def filter_minus(features: List[CandidateFeature]):
                # for LogisticRegression, drop MinusTransformation features;
                # any other classifier keeps the list unchanged
                if my_globale_module.classifier_global != LogisticRegression:
                    return features
                return [check_f for check_f in features
                        if not isinstance(check_f.transformation, MinusTransformation)]

            '''
            combinations_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(filter_minus(cost_2_unary_transformed[p[element]]))
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(filter_minus(cost_2_binary_transformed[p[element]]))
                    if p[element] in cost_2_combination:
                        lists_for_each_element[element].extend(cost_2_combination[p[element]])

                combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1])
            current_layer.extend(combinations_to_be_applied)
            '''



            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print("From " + str(length) + " combinations, we filter " +  str(length - len(current_layer)) + " nonunique raw feature combinations.")



            #now evaluate all from this layer
            #print(current_layer)

            print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
            results = evaluate_candidates_parallel(current_layer, self.n_jobs)
            print("----------- Evaluation Finished -----------")





            ##nested cv
            '''
            new_results_with_nested = []
            for r_result in results:
                if type(r_result) != type(None):
                    new_results_with_nested.append(r_result)
            #results = nested_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
            results = multiple_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
            for r_result in results:
                #print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(r_result.runtime_properties['test_score']) + ' nested: ' + str(r_result.runtime_properties['nested_cv_score']))
                print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(
                    r_result.runtime_properties['test_score']) + ' nested: ' + str(
                    r_result.runtime_properties['multiple_cv_score']))
            '''


            #print(results)

            layer_end_time = time.time() - self.global_starting_time

            #calculate whether we drop the evaluated candidate
            for candidate in results:

                ## check if we computed an equivalent feature before
                if candidate is not None and not isinstance(candidate.transformation, IdentityTransformation):
                    materialized_all = []
                    for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                        materialized_all.extend(candidate.runtime_properties['test_transformed'][fold_ii].flatten())
                    materialized = tuple(materialized_all)
                    if materialized in my_globale_module.materialized_set:
                        candidate = None
                    else:
                        my_globale_module.materialized_set.add(materialized)

                '''
                ## check if predictions exist already
                if type(candidate) != type(None) and 'test_fold_predictions' in candidate.runtime_properties:
                    materialized_all = []
                    for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                        materialized_all.extend(candidate.runtime_properties['test_fold_predictions'][fold_ii].flatten())
                    materialized = tuple(materialized_all)
                    if materialized in my_globale_module.predictions_set:
                        candidate = None
                    else:
                        my_globale_module.predictions_set.add(materialized)
                '''



                if candidate is not None:
                    candidate.runtime_properties['layer_end_time'] = layer_end_time

                    #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))


                    if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                        max_feature = candidate

                    if candidate.runtime_properties['passed']:

                        if isinstance(candidate, RawFeature):
                            if c not in cost_2_raw_features:
                                cost_2_raw_features[c]: List[CandidateFeature] = []
                            cost_2_raw_features[c].append(candidate)
                        elif isinstance(candidate.transformation, UnaryTransformation):
                            if c not in cost_2_unary_transformed:
                                cost_2_unary_transformed[c]: List[CandidateFeature] = []
                            cost_2_unary_transformed[c].append(candidate)
                        elif isinstance(candidate.transformation, IdentityTransformation):
                            if c not in cost_2_combination:
                                cost_2_combination[c]: List[CandidateFeature] = []
                            cost_2_combination[c].append(candidate)
                        else:
                            if c not in cost_2_binary_transformed:
                                cost_2_binary_transformed[c]: List[CandidateFeature] = []
                            cost_2_binary_transformed[c].append(candidate)
                    else:
                        if self.save_logs:
                            if c not in cost_2_dropped_evaluated_candidates:
                                cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                            cost_2_dropped_evaluated_candidates[c].append(candidate)
            


            satisfied_count = 0
            if c in cost_2_raw_features:
                satisfied_count += len(cost_2_raw_features[c])
            if c in cost_2_unary_transformed:
                satisfied_count += len(cost_2_unary_transformed[c])
            if c in cost_2_binary_transformed:
                satisfied_count += len(cost_2_binary_transformed[c])
            if c in cost_2_combination:
                satisfied_count += len(cost_2_combination[c])

            all_count = len(current_layer)
            if c == 1:
                all_count = len(cost_2_raw_features[c])


            print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " did satisfy the epsilon threshold.")


            if len(current_layer) > 0:
                if 'test_score' in max_feature.runtime_properties:
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
                else:
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(
                        max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(
                        max_feature.runtime_properties['score']) + "\n")
                #print("hyper: " + str(max_feature.runtime_properties['hyperparameters']))

                #print(max_feature.runtime_properties['fold_scores'])

            # upload best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')


            if self.save_logs:
                try:
                    pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                except Exception:
                    pickle.dump(cost_2_raw_features, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"),
                                protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_unary_transformed, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_unary.p",
                        "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_binary_transformed, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_binary.p",
                        "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_combination, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_combination.p",
                        "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_dropped_evaluated_candidates, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p",
                        "wb"), protocol=pickle.HIGHEST_PROTOCOL)


            max_feature_per_complexity[c] = max_feature


            if self.c_max is None and c > 2:
                # calculate harmonic mean
                harmonic_means = [0.0]*3
                for h_i in range(len(harmonic_means)):
                    simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c-h_i].get_complexity(), c,
                                                                       cost_2_raw_features, cost_2_unary_transformed,
                                                                       cost_2_binary_transformed, cost_2_combination)
                    accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c-h_i].runtime_properties['score'], c,
                                                                   cost_2_raw_features, cost_2_unary_transformed,
                                                                   cost_2_binary_transformed, cost_2_combination)

                    harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)
                    #print(str(max_feature_per_complexity[c-h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i))

                if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                    print("Best Harmonic Mean: " + str(max_feature_per_complexity[c-2]))
                    break


            if self.max_timestamp is not None and time.time() >= self.max_timestamp:
                break

            c += 1

            if self.c_max is not None and self.c_max < c:
                break





        def extend_all(all_representations: List[CandidateFeature], new_llist):
            for mylist in new_llist:
                all_representations.extend(mylist)

        #get all representation
        all_representations: List[CandidateFeature] = []
        extend_all(all_representations, cost_2_raw_features.values())
        extend_all(all_representations, cost_2_unary_transformed.values())
        extend_all(all_representations, cost_2_binary_transformed.values())
        extend_all(all_representations, cost_2_combination.values())

        self.all_representations = all_representations

        '''

        #find top k based on cv score
        scores = [c.runtime_properties['score'] for c in all_representations]
        sorted_cv_score_ids = np.argsort(np.array(scores)*-1)
        checking_k = 50
        top_k_representations = [all_representations[sorted_id] for sorted_id in sorted_cv_score_ids[0:checking_k]]

        #from top k - select best based on nested cv score
        top_k_representations = multiple_cv_score_parallel(top_k_representations, self.reader.splitted_values['train'],
                                           self.reader.splitted_target['train'])

        scores = [c.runtime_properties['multiple_cv_score'] for c in top_k_representations]

        max_nested_cv_score = -1
        max_nested_rep = None
        for eval_candidate in top_k_representations:
            if eval_candidate.runtime_properties['multiple_cv_score'] > max_nested_cv_score:
                max_nested_cv_score = eval_candidate.runtime_properties['multiple_cv_score']
                max_nested_rep = eval_candidate

        print(max_nested_rep)
        max_feature = max_nested_rep
        '''

        '''
        all_features = list(max_feature_per_complexity.values())
        all_features = multiple_cv_score_parallel(all_features, self.reader.splitted_values['train'], self.reader.splitted_target['train'])

        best_multiple_cv_score = -np.inf
        best_multiple_cv_candidate = None
        for all_f in all_features:
            if all_f.runtime_properties['multiple_cv_score'] > best_multiple_cv_score:
                best_multiple_cv_score = all_f.runtime_properties['multiple_cv_score']
                best_multiple_cv_candidate = all_f

        #find the most simple representation that is within the best representation's std
        complexities = [all_f.get_complexity() for all_f in all_features]
        ids_complex = np.argsort(complexities)
        for all_f_i in range(len(all_features)):
            print(str(all_features[ids_complex[all_f_i]]) + ' mcv: ' + str(all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score']) + ' mcv_std: ' + str(
                all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score_std']))

            if all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score'] > best_multiple_cv_candidate.runtime_properties['multiple_cv_score'] - best_multiple_cv_candidate.runtime_properties['multiple_cv_score_std']:
                max_feature = all_features[ids_complex[all_f_i]]
                break

        print(max_feature)
        '''

        # model selection via minimum global AICc
        min_aicc = np.inf
        min_aicc_feature = None

        all_aiccs = []
        for rep in list(max_feature_per_complexity.values()):
            all_aiccs.append(np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity']))

        def calculate_AIC_for_classification_paper(rss, n, k):
            # least-squares AIC: n * ln(RSS / n) + 2k
            AIC = 2 * k + float(n) * np.log(rss / float(n))
            return AIC

        def calculate_AICc_for_classification_paper(rss, n, k):
            # AICc adds the small-sample correction 2k(k + 1) / (n - k - 1)
            AIC = calculate_AIC_for_classification_paper(rss, n, k)
            AICc = AIC + ((2 * k * (k + 1)) / (n - k - 1))
            return AICc


        def calc_global_aicc(rep):
            return calculate_AICc_for_classification_paper(np.sum(rep.runtime_properties['additional_metrics']['rss']), np.sum(rep.runtime_properties['additional_metrics']['n']), rep.get_complexity())

        def is_better(old_aics, new_aics):
            # majority vote over per-fold AICc values (kept for reference; the
            # selection loop below uses calc_global_aicc instead)
            print(np.sum(np.array(new_aics) < np.array(old_aics)))
            return np.sum(np.array(new_aics) < np.array(old_aics)) > len(new_aics) / 2.0
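
        # Worked example of the AICc arithmetic above (illustrative numbers, not
        # taken from any dataset): rss = 40.0 over n = 200 samples with k = 3
        # parameters gives AIC = 6 + 200 * ln(0.2), about -315.89, and
        # AICc = AIC + 24 / 196, about -315.77. The representation with the
        # smallest (most negative) global AICc wins the selection loop below.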

        for rep in list(max_feature_per_complexity.values()):
            curr = np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity'])
            #print(str(rep) + ': ' + str(curr) + ' AICc min: ' + str(np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' AICc std: ' + str(np.std(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' P: ' + str(np.exp((min(all_aiccs) - curr)/2)) + ' CV AUC: ' + str(rep.runtime_properties['score']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['AICc_complexity']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['rss']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['n']))

            print(str(rep) + 'global_aicc: ' + str(calc_global_aicc(rep)))

            #if type(min_aicc_feature) == type(None) or is_better(min_aicc_feature.runtime_properties['additional_metrics']['AICc_complexity'], rep.runtime_properties['additional_metrics']['AICc_complexity']):
            if min_aicc_feature is None or calc_global_aicc(rep) < calc_global_aicc(min_aicc_feature):
                #min_aicc = np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])
                min_aicc_feature = rep
        max_feature = min_aicc_feature

        print(max_feature)

        return max_feature
Example #19
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        if self.save_logs:
            cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        self.complexity_delta = 1.0

        unique_raw_combinations = False


        baseline_score = 0.0  # self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        all_evaluated_features = set()

        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
        my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)
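        # Note: these module-level copies presumably exist so that the parallel
        # candidate evaluation can read shared state (classifier, folds, targets)
        # from my_globale_module instead of pickling `self` into every worker;
        # this is an assumption based on the naming, not confirmed by the snippet.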





        c = 1
        while True:
            current_layer: List[CandidateFeature] = []

            # cost c == 1: seed the repository with the raw features
            if c == 1:
                cost_2_raw_features[c]: List[CandidateFeature] = []
                #print(self.raw_features)
                for raw_f in self.raw_features:
                    sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                    raw_f.sympy_representation = sympy_representation
                    all_evaluated_features.add(sympy_representation)
                    if raw_f.is_numeric():
                        if raw_f.properties['missing_values']:
                            raw_f.runtime_properties['score'] = 0.0
                            cost_2_raw_features[c].append(raw_f)
                        else:
                            current_layer.append(raw_f)
                        #print("numeric: " + str(raw_f))
                    else:
                        raw_f.runtime_properties['score'] = 0.0
                        cost_2_raw_features[c].append(raw_f)
                        #print("nonnumeric: " + str(raw_f))

                    self.materialize_raw_features(raw_f)
                    #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

            # first: unary transformations
            # apply every unary transformation to all cost-(c-1) candidates in the repository (combinations excluded)
            unary_candidates_to_be_applied: List[CandidateFeature] = []
            if (c - 1) in cost_2_raw_features:
                unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
            if (c - 1) in cost_2_unary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
            if (c - 1) in cost_2_binary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])

            all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
            current_layer.extend(all_unary_features)

            # second: binary transformations
            # get all length-2 partitions of the remaining cost c-1
            partition = self.get_length_2_partition(c-1)
            #print("bin: c: " + str(c) + " partition" + str(partition))
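            # A length-2 partition of c-1 is every pair (a, b) with a + b = c - 1,
            # e.g. c - 1 = 4 yields (1, 3) and (2, 2); combining a parent of cost a
            # with a parent of cost b, plus cost 1 for the binary transformation
            # itself, lands the finished candidate at total cost c.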

            #apply cross product from partitions
            binary_candidates_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                    #print(list_of_combinations)
                    for combo in list_of_combinations:
                        if bt.is_applicable(combo):
                            sympy_representation = bt.get_sympy_representation(
                                [p.get_sympy_representation() for p in combo])
                            try:
                                if len(sympy_representation.free_symbols) > 0:  # if expression is not constant
                                    if not sympy_representation in all_evaluated_features:
                                        bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                        bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                        all_evaluated_features.add(sympy_representation)
                                        binary_candidates_to_be_applied.append(bin_candidate)
                                    else:
                                        #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                        pass
                                else:
                                    #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                    pass
                            except Exception:
                                pass
            current_layer.extend(binary_candidates_to_be_applied)
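            # Algebraically equivalent candidates collapse to the same sympy
            # expression and are pruned via all_evaluated_features; e.g. with
            # x0, x1 = sympy.Symbol('X0'), sympy.Symbol('X1'), the expressions
            # x0 + x1 and x1 + x0 compare equal, so only one of the two
            # corresponding feature representations is ever evaluated.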

            # third: feature combinations
            # first variant: treat the combination itself as a transformation,
            # so we can reuse the same partitioning as for the binary case
            partition = self.get_length_2_partition(c)
            #print("combo c: " + str(c) + " partition" + str(partition))

            combinations_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])
                    if p[element] in cost_2_combination:
                        lists_for_each_element[element].extend(cost_2_combination[p[element]])

                # extend rather than overwrite, so every partition contributes combinations
                combinations_to_be_applied.extend(self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1]))
            current_layer.extend(combinations_to_be_applied)



            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print("From " + str(length) + " combinations, we filter " + str(length - len(current_layer)) + " non-unique raw feature combinations.")



            #now evaluate all from this layer
            #print(current_layer)
            print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
            results = evaluate_candidates(current_layer)
            print("----------- Evaluation Finished -----------")

            #print(results)

            layer_end_time = time.time() - self.global_starting_time

            # decide for each evaluated candidate whether it is kept or dropped
            for candidate in results:
                if candidate is not None:
                    candidate.runtime_properties['layer_end_time'] = layer_end_time

                    #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))


                    if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                        max_feature = candidate

                    if candidate.runtime_properties['passed']:
                        if isinstance(candidate, RawFeature):
                            if c not in cost_2_raw_features:
                                cost_2_raw_features[c]: List[CandidateFeature] = []
                            cost_2_raw_features[c].append(candidate)
                        elif isinstance(candidate.transformation, UnaryTransformation):
                            if c not in cost_2_unary_transformed:
                                cost_2_unary_transformed[c]: List[CandidateFeature] = []
                            cost_2_unary_transformed[c].append(candidate)
                        elif isinstance(candidate.transformation, IdentityTransformation):
                            if c not in cost_2_combination:
                                cost_2_combination[c]: List[CandidateFeature] = []
                            cost_2_combination[c].append(candidate)
                        else:
                            if c not in cost_2_binary_transformed:
                                cost_2_binary_transformed[c]: List[CandidateFeature] = []
                            cost_2_binary_transformed[c].append(candidate)
                    else:
                        if self.save_logs:
                            if c not in cost_2_dropped_evaluated_candidates:
                                cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                            cost_2_dropped_evaluated_candidates[c].append(candidate)
            


            satisfied_count = 0
            if c in cost_2_raw_features:
                satisfied_count += len(cost_2_raw_features[c])
            if c in cost_2_unary_transformed:
                satisfied_count += len(cost_2_unary_transformed[c])
            if c in cost_2_binary_transformed:
                satisfied_count += len(cost_2_binary_transformed[c])
            if c in cost_2_combination:
                satisfied_count += len(cost_2_combination[c])

            all_count = len(current_layer)
            if c == 1:
                all_count = len(cost_2_raw_features[c])


            print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " satisfied the epsilon threshold.")


            if len(current_layer) > 0:
                if Config.get_default('score.test', 'False') == 'True':
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
                else:
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(
                        max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(
                        max_feature.runtime_properties['score']) + "\n")
                #print("hyper: " + str(max_feature.runtime_properties['hyperparameters']))

                #print(max_feature.runtime_properties['fold_scores'])

            # upload best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')


            if self.save_logs:
                pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)


            max_feature_per_complexity[c] = max_feature


            if self.c_max is None and c > 2:
                # calculate harmonic mean
                harmonic_means = [0.0]*3
                for h_i in range(len(harmonic_means)):
                    simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c-h_i].get_complexity(), c,
                                                                       cost_2_raw_features, cost_2_unary_transformed,
                                                                       cost_2_binary_transformed, cost_2_combination)
                    accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c-h_i].runtime_properties['score'], c,
                                                                   cost_2_raw_features, cost_2_unary_transformed,
                                                                   cost_2_binary_transformed, cost_2_combination)

                    harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)
                    #print(str(max_feature_per_complexity[c-h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i))
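                # harmonic_mean(s, a) = 2*s*a / (s + a) rewards representations that
                # balance simplicity and accuracy; the search stops once the candidate
                # from two complexity levels back dominates both of its successors,
                # i.e. the harmonic-mean curve has stopped improving.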

                if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                    print("Best Harmonic Mean: " + str(max_feature_per_complexity[c-2]))
                    break


            if self.max_timestamp is not None and time.time() >= self.max_timestamp:
                break

            c += 1

            if self.c_max is not None and self.c_max < c:
                break
Example #20
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        if self.save_logs:
            cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        self.complexity_delta = 1.0

        unique_raw_combinations = False


        baseline_score = 0.0  # self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        all_evaluated_features = set()

        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)




        ############################

        # start

        ############################

        current_layer = []
        c = 1

        cost_2_raw_features[c]: List[CandidateFeature] = []
        # print(self.raw_features)
        for raw_f in self.raw_features:
            sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
            raw_f.sympy_representation = sympy_representation
            all_evaluated_features.add(sympy_representation)
            if raw_f.is_numeric():
                current_layer.append(raw_f)
                # print("numeric: " + str(raw_f))
            else:
                raw_f.runtime_properties['score'] = 0.0
                cost_2_raw_features[c].append(raw_f)
                # print("nonnumeric: " + str(raw_f))

            self.materialize_raw_features(raw_f)
            raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

        # now evaluate all from this layer
        # print(current_layer)
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = evaluate_candidates(current_layer)
        print("----------- Evaluation Finished -----------")

        layer_end_time = time.time() - self.global_starting_time

        # calculate whether we drop the evaluated candidate
        for candidate in results:
            if candidate is not None:
                candidate.runtime_properties['layer_end_time'] = layer_end_time

                # print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))

                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate

                if candidate.runtime_properties['passed']:
                    if isinstance(candidate, RawFeature):
                    if c not in cost_2_raw_features:
                            cost_2_raw_features[c]: List[CandidateFeature] = []
                        cost_2_raw_features[c].append(candidate)
                    elif isinstance(candidate.transformation, UnaryTransformation):
                    if c not in cost_2_unary_transformed:
                            cost_2_unary_transformed[c]: List[CandidateFeature] = []
                        cost_2_unary_transformed[c].append(candidate)
                    elif isinstance(candidate.transformation, IdentityTransformation):
                    if c not in cost_2_combination:
                            cost_2_combination[c]: List[CandidateFeature] = []
                        cost_2_combination[c].append(candidate)
                    else:
                    if c not in cost_2_binary_transformed:
                            cost_2_binary_transformed[c]: List[CandidateFeature] = []
                        cost_2_binary_transformed[c].append(candidate)
                else:
                    if self.save_logs:
                    if c not in cost_2_dropped_evaluated_candidates:
                            cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                        cost_2_dropped_evaluated_candidates[c].append(candidate)

        print(cost_2_raw_features[c])

        # select the starting representation uniformly at random from the passing raw features
        # (the commented argmax line below is the greedy alternative)

        #next_id = np.argmax([rf.runtime_properties['score'] for rf in cost_2_raw_features[1]])
        next_id = np.random.randint(len(cost_2_raw_features[1]))
        next_rep = cost_2_raw_features[c][next_id]

        max_rep = next_rep

        current_lambda = 0

        number_runs = 200

        rep_succession = []

        for runs in range(number_runs):
            rep_succession.append(next_rep)
            #print('next: ' + str(next_rep))

            #######################
            #create branch
            #######################
            current_layer = []
            # first unary
            if not isinstance(next_rep.transformation, IdentityTransformation):
                current_layer.extend(self.generate_features(unary_transformations, [next_rep], all_evaluated_features))

            # second binary
            if not isinstance(next_rep.transformation, IdentityTransformation):
                binary_candidates_to_be_applied = []
                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge([next_rep], cost_2_raw_features[1],
                                                               bt.parent_feature_order_matters,
                                                               bt.parent_feature_repetition_is_allowed)
                    # print(list_of_combinations)
                    for combo in list_of_combinations:
                        if bt.is_applicable(combo):
                            sympy_representation = bt.get_sympy_representation(
                                [p.get_sympy_representation() for p in combo])
                            try:
                                if len(sympy_representation.free_symbols) > 0:  # if expression is not constant
                                    if not sympy_representation in all_evaluated_features:
                                        bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                        bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                        binary_candidates_to_be_applied.append(bin_candidate)
                                    else:
                                        # print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                        pass
                                else:
                                    # print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                    pass
                            except Exception:
                                pass
                current_layer.extend(binary_candidates_to_be_applied)

            # third: feature combinations
            '''
            combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, [next_rep], cost_2_raw_features[1])
            current_layer.extend(combinations_to_be_applied)
            '''
            #print(current_layer)

            # select next representation; new_rep stays None if no candidate evaluates successfully
            new_rep = None
            shuffled_indices = np.arange(len(current_layer))
            np.random.shuffle(shuffled_indices)
            for rep_i in range(len(current_layer)):
                new_rep = current_layer[shuffled_indices[rep_i]]
                all_evaluated_features.add(next_rep.sympy_representation)

                new_rep = evaluate_candidates([new_rep])[0]
                if new_rep is not None:
                    break

            if new_rep is None:
                break

            print(str(new_rep) + " cv score: " + str(new_rep.runtime_properties['score']) + " test: " + str(
                new_rep.runtime_properties['test_score']))

            if new_rep.runtime_properties['score'] * self.score._sign > max_rep.runtime_properties['score']:
                max_rep = new_rep
                print("max representation: " + str(max_rep))

            if new_rep.runtime_properties['score'] * self.score._sign <= rep_succession[-(current_lambda + 1)].runtime_properties['score']:
                current_lambda += 1
            if current_lambda >= self.lambda_threshold:
                next_rep = max_rep
                current_lambda = 0
            else:
                next_rep = new_rep
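            # Escape rule: current_lambda counts consecutive steps that failed to
            # beat the representation from current_lambda + 1 steps back in
            # rep_succession; once it reaches self.lambda_threshold, the walk
            # restarts from the best representation seen so far (max_rep)
            # instead of continuing from the latest one.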
Example #21
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()

        for raw_f in self.raw_features:
            raw_f.properties['type'] = 'float'

        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        R_w = 15000
        max_iterations = 15  #15
        threshold_f = 0.001
        epsilon_w = 0.01
        threshold_w = 0.0
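        # Search budget and thresholds, as used by the ExploreKit-style loop below:
        # R_w bounds the number of candidate evaluations per iteration, threshold_f
        # drops candidates whose ranking score falls below it, epsilon_w accepts a
        # candidate early once its improvement reaches it, and threshold_w is the
        # minimum improvement required to keep iterating at all.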

        all_features = self.produce_features()

        print(len(all_features))

        self.base_features = CandidateFeature(
            IdentityTransformation(len(self.raw_features)), self.raw_features)

        results = {}

        for i in range(max_iterations):

            print("base features: " + str(self.base_features))

            results[i] = self.evaluate_candidates([self.base_features],
                                                  myfolds)[0]
            print(results[i])
            print(results[i].runtime_properties)

            feature_scores = self.evaluate_ranking(all_features)
            ids = np.argsort(np.array(feature_scores) * -1)
            print(feature_scores)

            best_improvement_so_far = -np.inf
            best_Feature_So_Far = None
            evaluated_candidate_features = 0
            for f_i in range(len(feature_scores)):
                if feature_scores[ids[f_i]] < threshold_f:
                    break

                current_feature_set = CandidateFeature(
                    IdentityTransformation(2),
                    [self.base_features, all_features[ids[f_i]]])
                print(current_feature_set)
                result = self.evaluate_candidates([current_feature_set],
                                                  myfolds)[0]
                evaluated_candidate_features += 1
                improvement = result.runtime_properties['score'] - results[
                    i].runtime_properties['score']

                print("Candidate: " + str(all_features[ids[f_i]]) +
                      " score: " + str(result.runtime_properties['score']) +
                      " info: " + str(feature_scores[ids[f_i]]))
                print("improvement: " + str(improvement))
                if improvement > best_improvement_so_far:
                    best_improvement_so_far = improvement
                    best_Feature_So_Far = result

                    results[i] = best_Feature_So_Far
                    results[i].runtime_properties[
                        'score_improvement'] = improvement
                    results[i].runtime_properties[
                        'info_gain'] = feature_scores[ids[f_i]]
                    results[i].runtime_properties['global time'] = time.time() - self.global_starting_time

                    pickle.dump(
                        results,
                        open(
                            Config.get("tmp.folder") + "/explorekit_results.p",
                            "wb"))

                if improvement >= epsilon_w:
                    break
                if evaluated_candidate_features >= R_w:
                    break

            if best_improvement_so_far > threshold_w:
                self.base_features = best_Feature_So_Far
            else:
                return self.base_features

            all_features_new = []
            for feat_i in range(len(feature_scores)):
                if feature_scores[feat_i] >= 0:
                    all_features_new.append(all_features[feat_i])
            all_features = all_features_new
        return results
Example #22
File: bench_new.py Project: BigDaMa/DFS
ground_truth = [28, 48, 64, 105, 128, 153, 241, 281, 318, 336, 338, 378, 433, 442, 451, 453, 455, 472, 475, 493]


print(len(ground_truth))

mask = np.zeros(len(numeric_representations), dtype=bool)
for i in range(len(numeric_representations)):
	for g in ground_truth:
		if str(numeric_representations[i]) == 'V' + str(g):
			mask[i] = True
			break
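
# mask marks the representations whose string form matches a ground-truth
# column name ('V28', 'V48', ...); np.sum(mask) below should equal
# len(ground_truth) when every ground-truth feature is present.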

print(np.sum(mask))

all_features = CandidateFeature(IdentityTransformation(-1), numeric_representations)
all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features])
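
# all_features concatenates every numeric representation via IdentityTransformation,
# and MinMaxScalingTransformation rescales the concatenated matrix to [0, 1];
# all_standardized.pipeline can then be dropped into a sklearn Pipeline as one step.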

#foreigner = np.array(X_train[:,7])
#gender = np.array(['female' in personal_status for personal_status in X_train[:,15]])

scoring = {'auc': make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)}


#for count_i in range(10):
parameter_grid = {'model__penalty': ['l2'], 'model__C': [1], 'model__solver': ['lbfgs'],
				  'model__class_weight': ['balanced'], 'model__max_iter': [10000], 'model__multi_class': ['auto']}

my_pipeline = Pipeline([('features', all_standardized.pipeline),
						#('selection', L1Selection()),
						#('selection', SelectKBest(score_func=mutual_info_classif,k=10)),
						# the source snippet is truncated here; the 'model__' keys in
						# parameter_grid imply a final step named 'model', presumably:
						('model', LogisticRegression())
						])
Example #23
def run_pipeline(which_features_to_use, c=None, runs=1):

	model = LogisticRegression

	if c is None:
		c = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
	else:
		c = [c]

	parameter_grid = {'c__penalty': ['l2'], 'c__C': c, 'c__solver': ['lbfgs'],
					  'c__class_weight': ['balanced'], 'c__max_iter': [10000], 'c__multi_class': ['auto']}
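	# The 'c__' prefix routes each grid entry to the pipeline step named 'c'
	# (the LogisticRegression model below), per sklearn's Pipeline parameter
	# naming convention <step>__<param>.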

	auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

	numeric_representations = pickle.load(open("/home/felix/phd/feature_constraints/" + str(which_experiment) + "/features.p", "rb"))

	#print(len(numeric_representations))

	#X_train, X_test, y_train, y_test
	X_train = pickle.load(open("/home/felix/phd/feature_constraints/" + str(which_experiment) + "/X_train.p", "rb"))
	X_test = pickle.load(open("/home/felix/phd/feature_constraints/" + str(which_experiment) + "/X_test.p", "rb"))
	y_train = pickle.load(open("/home/felix/phd/feature_constraints/" + str(which_experiment) + "/y_train.p", "rb"))
	y_test = pickle.load(open("/home/felix/phd/feature_constraints/" + str(which_experiment) + "/y_test.p", "rb"))



	#generate pipeline
	all_selected_features = []
	for i in range(len(which_features_to_use)):
		if which_features_to_use[i]:
			all_selected_features.append(numeric_representations[i])

	all_features = CandidateFeature(IdentityTransformation(-1), all_selected_features)
	all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features])

	my_pipeline = Pipeline([('f', all_standardized.pipeline),
							('c', model())
							])

	cv_scores = []
	test_scores = []
	pred_test = None
	proba_pred_test = None

	if runs > 1:
		for r in range(runs):
			kfolds = StratifiedKFold(10, shuffle=True, random_state=42+r)
			pipeline = GridSearchCV(my_pipeline, parameter_grid, cv=kfolds.split(X_train, y_train), scoring=auc, n_jobs=4)
			pipeline.fit(X_train, y_train)

			pred_test = pipeline.predict(X_test)
			proba_pred_test = pipeline.predict_proba(X_test)

			test_auc = auc(pipeline, X_test, y_test)

			cv_scores.append(pipeline.best_score_)
			test_scores.append(test_auc)

		std_loss = np.std(cv_scores)
		loss = np.average(cv_scores)
	else:
		kfolds = StratifiedKFold(10, shuffle=True, random_state=42)
		pipeline = GridSearchCV(my_pipeline, parameter_grid, cv=kfolds.split(X_train, y_train), scoring=auc, n_jobs=4)
		pipeline.fit(X_train, y_train)

		pred_test = pipeline.predict(X_test)
		proba_pred_test = pipeline.predict_proba(X_test)

		test_auc = auc(pipeline, X_test, y_test)

		std_loss = pipeline.cv_results_['std_test_score'][pipeline.best_index_]
		#std_loss = np.min([pipeline.cv_results_['split' + str(split)+ '_test_score'][pipeline.best_index_] for split in range(10)])
		loss = pipeline.cv_results_['mean_test_score'][pipeline.best_index_]
		test_scores.append(test_auc)

		print(pipeline.classes_)

	return loss, np.average(test_scores), pred_test, std_loss, proba_pred_test
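
# Hypothetical usage sketch (the feature count, mask, and C value below are
# made up for illustration and depend on the pickled feature list):
# mask = np.ones(20, dtype=bool)  # use every feature
# loss, test_auc, pred, std, proba = run_pipeline(mask, c=1, runs=1)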