class ExploreKitSelection:
    """Enumerate candidate features, rank them by interpretability, and
    benchmark the fit/transform cost of each candidate.

    NOTE(review): relies on project types (Splitter, Reader, Generator,
    CandidateFeature, RawFeature, Config) defined elsewhere in the package.
    """

    def __init__(self, dataset_config, classifier=None,
                 grid_search_parameters=None):
        """Store experiment configuration.

        The previous signature used an estimator instance and a dict as
        default arguments; both were created once at definition time and
        shared across every instance (classic mutable-default bug). They
        are now created per instance.
        """
        self.dataset_config = dataset_config
        # default estimator built lazily so each instance gets its own
        self.classifier = classifier if classifier is not None else LogisticRegression()
        if grid_search_parameters is None:
            grid_search_parameters = {
                'classifier__penalty': ['l2'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'classifier__solver': ['lbfgs']
            }
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):
        """Read/split the dataset and generate every candidate feature."""
        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)
        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], s)
        raw_features = self.dataset.read()
        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))

    #rank and select features
    def random_select(self, k: int):
        """Return k candidate indices chosen uniformly at random."""
        arr = np.arange(len(self.candidates))
        np.random.shuffle(arr)
        return arr[0:k]

    def select_interpretable(self, k: int):
        """Return the candidate ids holding interpretability ranks 0..k-1."""
        # invert the mapping: rank -> candidate id
        # (comprehension variable renamed; it used to shadow parameter k)
        inv_map = {rank: cand_id
                   for cand_id, rank in self.candidate_id_to_ranked_id.items()}
        selected = []
        for i in range(k):
            selected.append(inv_map[i])
        return selected

    def generate_target(self):
        """Label-encode the training target into self.current_target."""
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self, candidate: CandidateFeature, runs=10):
        """Time-benchmark a candidate by running fit+transform `runs` times."""
        for i in range(runs):
            candidate.pipeline.fit(self.dataset.splitted_values['train'])
            candidate.pipeline.transform(self.dataset.splitted_values['train'])

    def create_starting_features(self):
        """Materialize every raw feature into one dense training matrix."""
        Fi: List[RawFeature] = self.dataset.raw_features

        #materialize and numpyfy the features
        starting_feature_matrix = np.zeros(
            (Fi[0].materialize()['train'].shape[0], len(Fi)))
        for f_index in range(len(Fi)):
            starting_feature_matrix[:, f_index] = Fi[f_index].materialize()['train']
        return starting_feature_matrix

    def my_arg_sort(self, seq):
        """Return the argsort of `seq` (indices in ascending value order)."""
        # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3383106#3383106
        # non-lambda version by Tony Veijalainen
        return [i for (v, i) in sorted((v, i) for (i, v) in enumerate(seq))]

    def get_interpretability_ranking(self):
        """Build candidate_id -> rank (0 = most interpretable)."""
        #high interpretability -> low interpretability
        interpretability_ids = self.my_arg_sort(self.candidates)
        self.candidate_id_to_ranked_id = {}
        for i in range(len(interpretability_ids)):
            self.candidate_id_to_ranked_id[interpretability_ids[i]] = i

    def get_traceability_ranking(self):
        """Build candidate_id -> rank by descending traceability and dump it."""
        # high traceability -> low traceability
        self.traceability: List[float] = []
        for c_i in range(len(self.candidates)):
            self.traceability.append(self.candidates[c_i].calculate_traceability())
        ids = np.argsort(np.array(self.traceability) * -1)
        self.candidate_id_to_ranked_id = {}
        for i in range(len(ids)):
            self.candidate_id_to_ranked_id[ids[i]] = i

        all_data = {}
        all_data['my_dict'] = self.candidate_id_to_ranked_id
        all_data['traceability'] = self.traceability
        # with-statement closes the handle (was leaked before)
        with open("/tmp/traceability.p", "wb") as f:
            pickle.dump(all_data, f)

    def get_interpretability(self, candidate_id):
        """Map a candidate's rank to a score in (0, 1]; rank 0 scores highest."""
        return 1.0 - ((self.candidate_id_to_ranked_id[candidate_id] + 1)
                      / float(len(self.candidate_id_to_ranked_id)))

    def evaluate_candidates(self, candidates):
        """Benchmark all candidates in parallel on 10 stratified folds."""
        self.preprocessed_folds = []
        # NOTE(review): random_state without shuffle=True is rejected by
        # newer scikit-learn; assumes the pinned project version.
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(
                self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))
        # context manager terminates the worker pool (was leaked before)
        with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
            results = pool.map(self.evaluate_single_candidate, candidates)
        return results

    def evaluate_single_candidate(self, candidate):
        """Time one candidate; failures are logged and yield only timing info."""
        result = {}
        time_start_gs = time.time()
        runs = 10
        try:
            self.evaluate(candidate, runs)
            #print("feature: " + str(candidate) + " -> " + str(new_score))
        except Exception as e:
            # best-effort: log and keep going so one bad candidate does not
            # abort the whole sweep
            print(str(candidate) + " -> " + str(e))
        result['candidate'] = candidate
        result['time'] = (time.time() - time_start_gs) / float(runs)
        return result

    def run(self):
        """Full sweep: generate, rank by interpretability, benchmark all."""
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        candidate_name_to_id = {}
        for c_i in range(len(self.candidates)):
            candidate_name_to_id[self.candidates[c_i].get_name()] = c_i
        # with-statements close the handles (were leaked before); file names
        # kept verbatim (including the historical "all_candiates" spelling)
        with open("/tmp/name2id.p", "wb") as f:
            pickle.dump(candidate_name_to_id, f)
        with open("/tmp/all_candiates.p", "wb") as f:
            pickle.dump(self.candidates, f)

        self.get_interpretability_ranking()
        #self.get_traceability_ranking()

        #evaluate starting matrix
        #start_score = self.evaluate(starting_feature_matrix)
        start_score = -1
        print("start score: " + str(start_score))

        #get candidates that should be evaluated
        ranked_selected_candidate_ids = self.select_interpretable(len(self.candidates))

        start_time = time.time()
        results = self.evaluate_candidates(
            np.array(self.candidates)[ranked_selected_candidate_ids])
        print("evaluation time: " + str((time.time() - start_time) / 60) + " min")

        return start_score, results, ranked_selected_candidate_ids
class SissoExperiment:
    """Featuretools-based feature-engineering experiment.

    NOTE(review): relies on project types (Splitter, Reader, CandidateFeature,
    IdentityTransformation) and on `ft` (featuretools) / `pd` being imported
    at module level. The GridSearchCV evaluation path and an alternative
    parallel evaluate_candidates existed only as commented-out string blocks
    and have been removed.
    """

    def __init__(self, dataset_config, classifier=None,
                 grid_search_parameters=None):
        """Store experiment configuration.

        Previously the defaults were a shared estimator instance and a
        shared dict (mutable default arguments); both are now created per
        instance.
        """
        self.dataset_config = dataset_config
        self.classifier = classifier if classifier is not None else LogisticRegression()
        if grid_search_parameters is None:
            grid_search_parameters = {
                'classifier__penalty': ['l2'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'classifier__solver': ['lbfgs']
            }
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):
        """Read and split the dataset (candidate generation is disabled here)."""
        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)
        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], s)
        raw_features = self.dataset.read()
        #g = Generator(raw_features)
        #self.candidates = g.generate_all_candidates()
        #print("Number candidates: " + str(len(self.candidates)))

    def generate_target(self):
        """Label-encode the training target into self.current_target."""
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self, candidate, score=None, folds=10):
        """Run featuretools deep feature synthesis over the training data.

        `score` defaults (lazily, per call) to
        make_scorer(roc_auc_score, average='micro'); `candidate`/`score`/
        `folds` are currently unused by the active code path — kept for
        signature compatibility with the sibling experiment classes.
        Returns an empty result dict; printing the synthesized columns is
        the observable effect.
        """
        if score is None:
            score = make_scorer(roc_auc_score, average='micro')
        result = {}

        # column names for the statlog heart dataset
        feateng_cols = ['age', 'sex', 'chest', 'resting_blood_pressure',
                        'serum_cholestoral', 'fasting_blood_sugar',
                        'resting_electrocardiographic_results',
                        'maximum_heart_rate_achieved', 'exercise_induced_angina',
                        'oldpeak', 'slope', 'number_of_major_vessels', 'thal']
        print(self.current_target)

        df = pd.DataFrame(data=self.dataset.splitted_values['train'],
                          columns=feateng_cols)
        df['id'] = pd.Series(range(len(df)), index=df.index)
        df['target'] = pd.Series(self.current_target,
                                 index=df.index).map({0: 'healthy', 1: 'ill'})

        es = ft.EntitySet("example")
        es.entity_from_dataframe(dataframe=df, entity_id="heart", index="id")
        es.normalize_entity(base_entity_id='heart', new_entity_id='target_e',
                            index='id', additional_variables=['target'])
        print(es)

        feature_matrix, feature_defs = ft.dfs(entityset=es,
                                              target_entity='target_e',
                                              max_depth=6,
                                              verbose=1,
                                              n_jobs=4,
                                              max_features=2)

        # drop synthesized columns that leak the target (keep 'target' itself)
        drop_cols = []
        for col in feature_matrix:
            if col == 'target':
                pass
            else:
                if 'target' in col:
                    drop_cols.append(col)
        feature_matrix = feature_matrix[[x for x in feature_matrix
                                         if x not in drop_cols]]
        print(feature_matrix.columns)
        return result

    def evaluate_candidates(self, candidates):
        """Sequentially evaluate candidates on 10 stratified folds."""
        self.preprocessed_folds = []
        # NOTE(review): random_state without shuffle=True is rejected by
        # newer scikit-learn; assumes the pinned project version.
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(
                self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))
        results = []
        for c in candidates:
            results.append(self.evaluate_single_candidate(c))
        return results

    def evaluate_single_candidate(self, candidate):
        """Evaluate one candidate and return evaluate()'s raw result."""
        # (removed dead store: new_score = -1.0 was immediately overwritten)
        return self.evaluate(candidate)

    def run(self):
        """Read data, encode the target, evaluate the plain attribute set."""
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        print([r.name for r in self.dataset.raw_features])
        plain_attributes = CandidateFeature(
            IdentityTransformation(len(self.dataset.raw_features)),
            self.dataset.raw_features)
        self.evaluate_candidates([plain_attributes])
# --- analysis script: join stored feature predictions with the dataset ---
# NOTE(review): `file` is not assigned anywhere in this excerpt — presumably
# set earlier in the original script. TODO confirm.
with open(file, "rb") as f:  # with-statement closes the handle (was leaked)
    all_data = pickle.load(f)

with open('/home/felix/phd/feature_predictions/all_data_predictions.p', "rb") as f:
    feature_predictions = pickle.load(f)

# index prediction results by the candidate's string representation
name2result_predictions = {}
for result in feature_predictions:
    name2result_predictions[str(result['candidate'])] = result

dataset_config = (Config.get('statlog_heart.csv'),
                  int(Config.get('statlog_heart.target')))

s = Splitter(train_fraction=[0.6, 10000000], seed=42)
dataset = Reader(dataset_config[0], dataset_config[1], s)
raw_features = dataset.read()

X = dataset.splitted_values['train']

#delta mean -> avg, min, max gain


def calculate_MSE(candidate: CandidateFeature, X):
    """Fit/transform each parent of `candidate` on X and collect the outputs.

    NOTE(review): this function currently returns None — the correlation /
    score computation below is commented out, so it looks truncated or
    work-in-progress. Kept as-is.
    """
    ys = []
    for p in candidate.parents:
        p.fit(X)
        y = p.transform(X)
        ys.append(y)

    #correlation
    #score = np.corrcoef(np.matrix(ys[0]).A1, np.matrix(ys[1]).A1)[0,1]
class SissoExperiment:
    """AutoFeat-based experiment: dumps the training matrix and target to
    /tmp for external symbolic feature construction (SISSO-style).

    NOTE(review): relies on project types (Splitter, Reader, Generator,
    CandidateFeature, IdentityTransformation) and on AutoFeatRegression
    imported at module level. Commented-out GridSearchCV / parallel-pool
    string blocks have been removed.
    """

    def __init__(self, dataset_config, classifier=None,
                 grid_search_parameters=None):
        """Store experiment configuration.

        Previously the defaults were a shared estimator instance and a
        shared dict (mutable default arguments); both are now created per
        instance.
        """
        self.dataset_config = dataset_config
        self.classifier = classifier if classifier is not None else LogisticRegression()
        if grid_search_parameters is None:
            grid_search_parameters = {
                'classifier__penalty': ['l2'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'classifier__solver': ['lbfgs']
            }
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):
        """Read/split the dataset and generate every candidate feature."""
        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)
        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], s)
        raw_features = self.dataset.read()
        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))

    def generate_target(self):
        """Label-encode the training target into self.current_target."""
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self, candidate, score=None, folds=10):
        """Set up AutoFeat and dump the train matrix/target to /tmp.

        `score` defaults (lazily, per call) to
        make_scorer(roc_auc_score, average='micro'); `candidate`/`score`/
        `folds` are unused by the active code path — kept for signature
        compatibility. Returns an empty result dict.
        """
        if score is None:
            score = make_scorer(roc_auc_score, average='micro')
        result = {}

        # column names for the statlog heart dataset
        feateng_cols = [
            'age', 'sex', 'chest', 'resting_blood_pressure',
            'serum_cholestoral', 'fasting_blood_sugar',
            'resting_electrocardiographic_results',
            'maximum_heart_rate_achieved', 'exercise_induced_angina',
            'oldpeak', 'slope', 'number_of_major_vessels', 'thal'
        ]
        print(self.current_target)

        afreg = AutoFeatRegression(n_jobs=4, feateng_cols=feateng_cols)
        #df = afreg.fit_transform(pd.DataFrame(data=self.dataset.splitted_values['train'], columns=feateng_cols), self.current_target)

        np.save('/tmp/X', self.dataset.splitted_values['train'])
        np.save('/tmp/y', self.current_target)

        return result

    def evaluate_candidates(self, candidates):
        """Sequentially evaluate candidates on 10 stratified folds."""
        self.preprocessed_folds = []
        # NOTE(review): random_state without shuffle=True is rejected by
        # newer scikit-learn; assumes the pinned project version.
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(
                self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))
        results = []
        for c in candidates:
            results.append(self.evaluate_single_candidate(c))
        return results

    def evaluate_single_candidate(self, candidate):
        """Evaluate one candidate and return evaluate()'s raw result."""
        # (removed dead store: new_score = -1.0 was immediately overwritten)
        return self.evaluate(candidate)

    def run(self):
        """Read data, encode the target, evaluate the plain attribute set."""
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        print([r.name for r in self.dataset.raw_features])
        plain_attributes = CandidateFeature(
            IdentityTransformation(len(self.dataset.raw_features)),
            self.dataset.raw_features)
        self.evaluate_candidates([plain_attributes])
class ExploreKitSelection_iterative_search:
    """Iterative ExploreKit-style search: generates candidate features,
    filters failing ones, and grid-search-evaluates feature-selection
    pipelines.

    NOTE(review): relies on project types (Splitter, Reader, Generator,
    CandidateFeature, RawFeature, IdentityTransformation,
    FeatureSelectionTransformation, Config) defined elsewhere. Dead
    commented-out string blocks (sequential evaluate_candidates, old
    evaluate_single_candidate) have been removed.
    """

    def __init__(self, dataset_config, classifier=None,
                 grid_search_parameters=None):
        """Store experiment configuration.

        Previously the defaults were a shared estimator instance and a
        shared dict (mutable default arguments); both are now created per
        instance.
        """
        self.dataset_config = dataset_config
        self.classifier = classifier if classifier is not None else LogisticRegression()
        if grid_search_parameters is None:
            grid_search_parameters = {
                'classifier__penalty': ['l2'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'classifier__solver': ['lbfgs'],
                'classifier__class_weight': ['balanced'],
                'classifier__max_iter': [10000]
            }
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):
        """Read/split the dataset and generate every candidate feature."""
        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)
        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], s)
        raw_features = self.dataset.read()
        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))

    #rank and select features
    def random_select(self, k: int):
        """Return k candidate indices chosen uniformly at random."""
        arr = np.arange(len(self.candidates))
        np.random.shuffle(arr)
        return arr[0:k]

    def generate_target(self):
        """Label-encode the training target into self.current_target."""
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self, candidate, score=None, folds=10):
        """Grid-search the classifier over one candidate (or a list of them).

        `score` defaults (lazily, per call) to
        make_scorer(roc_auc_score, average='micro').
        Returns {'score': best CV score, 'hyperparameters': best params}.
        """
        if score is None:
            score = make_scorer(roc_auc_score, average='micro')
        parameters = self.grid_search_parameters

        # a plain iterable of candidates becomes one FeatureUnion
        if not isinstance(candidate, CandidateFeature):
            pipeline = Pipeline([('features',
                                  FeatureUnion([(p.get_name(), p.pipeline)
                                                for p in candidate])),
                                 ('classifier', self.classifier)])
        else:
            pipeline = Pipeline([('features',
                                  FeatureUnion([(candidate.get_name(),
                                                 candidate.pipeline)])),
                                 ('classifier', self.classifier)])

        result = {}
        # NOTE(review): iid= was removed in newer scikit-learn; assumes the
        # pinned project version still supports it.
        clf = GridSearchCV(pipeline, parameters, cv=self.preprocessed_folds,
                           scoring=score, iid=False, error_score='raise')
        clf.fit(self.dataset.splitted_values['train'], self.current_target)
        result['score'] = clf.best_score_
        result['hyperparameters'] = clf.best_params_
        return result

    def create_starting_features(self):
        """Materialize every raw feature into one dense training matrix."""
        Fi: List[RawFeature] = self.dataset.raw_features

        #materialize and numpyfy the features
        starting_feature_matrix = np.zeros(
            (Fi[0].materialize()['train'].shape[0], len(Fi)))
        for f_index in range(len(Fi)):
            starting_feature_matrix[:, f_index] = Fi[f_index].materialize()['train']
        return starting_feature_matrix

    def evaluate_candidates(self, candidates):
        """Evaluate candidates in parallel on 10 stratified folds."""
        self.preprocessed_folds = []
        # NOTE(review): random_state without shuffle=True is rejected by
        # newer scikit-learn; assumes the pinned project version.
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(
                self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))
        # context manager terminates the worker pool (was leaked before)
        with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
            results = pool.map(self.evaluate_single_candidate, candidates)
        return results

    def evaluate_single_candidate(self, candidate):
        """Evaluate one candidate; on failure record score -1.0 and keep going."""
        result = {}
        time_start_gs = time.time()
        try:
            result = self.evaluate(candidate)
            #print("feature: " + str(candidate) + " -> " + str(new_score))
        except Exception as e:
            print(str(candidate) + " -> " + str(e))
            result['score'] = -1.0
            result['hyperparameters'] = {}
        result['candidate'] = candidate
        result['time'] = time.time() - time_start_gs
        return result

    #https://stackoverflow.com/questions/10035752/elegant-python-code-for-integer-partitioning
    def partition(self, number):
        """Return all integer partitions of `number` as a set of sorted tuples."""
        answer = set()
        answer.add((number, ))
        for x in range(1, number):
            for y in self.partition(number - x):
                answer.add(tuple(sorted((x, ) + y)))
        return answer

    def get_all_features_below_n_cost(self, cost):
        """Candidates whose transformation count (+1) is <= cost."""
        filtered_candidates = []
        for i in range(len(self.candidates)):
            if (self.candidates[i].get_number_of_transformations() + 1) <= cost:
                filtered_candidates.append(self.candidates[i])
        return filtered_candidates

    def get_all_features_equal_n_cost(self, cost):
        """Candidates whose transformation count (+1) is exactly cost."""
        filtered_candidates = []
        for i in range(len(self.candidates)):
            if (self.candidates[i].get_number_of_transformations() + 1) == cost:
                filtered_candidates.append(self.candidates[i])
        return filtered_candidates

    def get_all_possible_representations_for_step_x(self, x):
        """All frozensets of distinct candidates whose costs partition x."""
        all_representations = set()
        partitions = self.partition(x)

        #get candidates of partitions
        candidates_with_cost_x = {}
        for i in range(x + 1):
            candidates_with_cost_x[i] = self.get_all_features_equal_n_cost(i)

        for p in partitions:
            current_list = itertools.product(
                *[candidates_with_cost_x[pi] for pi in p])
            for c_output in current_list:
                # only keep combinations of pairwise-distinct candidates
                if len(set(c_output)) == len(p):
                    all_representations.add(frozenset(c_output))
        return all_representations

    def filter_failing_features(self):
        """Keep only candidates whose fit/transform succeeds on the train split."""
        working_features: List[CandidateFeature] = []
        for candidate in self.candidates:
            try:
                candidate.fit(self.dataset.splitted_values['train'])
                candidate.transform(self.dataset.splitted_values['train'])
            except Exception:
                # was a bare `except:`, which also swallowed KeyboardInterrupt
                continue
            working_features.append(candidate)
        return working_features

    def run(self):
        """End-to-end: generate, filter, wrap in feature selection, evaluate."""
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        working_features = self.filter_failing_features()

        all_f = CandidateFeature(IdentityTransformation(len(working_features)),
                                 working_features)
        selection = CandidateFeature(
            FeatureSelectionTransformation(
                1, 2,
                LogisticRegression(penalty='l2',
                                   solver='lbfgs',
                                   class_weight='balanced',
                                   max_iter=10000)), [all_f])

        results = self.evaluate_candidates([selection])
        new_scores = [r['score'] for r in results]
        best_id = np.argmax(new_scores)
        print(results[best_id])
class EvaluationFramework:
    # Shared driver for feature-construction experiments: splits the data,
    # encodes the target, builds CV folds, and dispatches candidates to the
    # module-level `evaluate` / `evaluate_randomcv` helpers.
    #
    # NOTE(review): this class reads attributes it never assigns here
    # (self.reader in generate(), self.folds and self.score in
    # generate_target()/evaluate_candidates()) — presumably set by a subclass
    # or by the caller beforehand. TODO confirm against the subclasses.

    def __init__(
            self,
            dataset_config,
            # NOTE(review): `classifier` is a CLASS here (not an instance);
            # it is instantiated via self.classifier() where needed. The dict
            # default argument is created once and shared across instances.
            classifier=LogisticRegression,
            grid_search_parameters={
                'classifier__penalty': ['l2'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'classifier__solver': ['lbfgs'],
                'classifier__class_weight': ['balanced'],
                'classifier__max_iter': [10000],
                'classifier__multi_class': ['auto']
            },
            transformation_producer=get_transformation_for_feature_space):
        """Store experiment configuration."""
        self.dataset_config = dataset_config
        self.classifier = classifier
        self.grid_search_parameters = grid_search_parameters
        self.transformation_producer = transformation_producer

    #generate all possible combinations of features
    def generate(self, seed=42):
        """Read/split the dataset; choose the splitter by estimator kind."""
        # self.reader is expected to be set (possibly to None) before this runs
        if type(self.reader) == type(None):
            s = None
            # stratified-style split for classifiers, random for regressors
            if isinstance(self.classifier(), ClassifierMixin):
                s = Splitter(train_fraction=[0.6, 10000000],
                             valid_fraction=0.0,
                             test_fraction=0.4,
                             seed=seed)
            elif isinstance(self.classifier(), RegressorMixin):
                s = RandomSplitter(train_fraction=[0.6, 10000000],
                                   valid_fraction=0.0,
                                   test_fraction=0.4,
                                   seed=seed)
            else:
                pass
            self.dataset = Reader(self.dataset_config[0],
                                  self.dataset_config[1], s)
        else:
            self.dataset = self.reader
        self.raw_features = self.dataset.read()

        print("training:" + str(len(self.dataset.splitted_target['train'])))
        print("test:" + str(len(self.dataset.splitted_target['test'])))

        if Config.get_default('instance.selection', 'False') == 'True':
            # keep an untouched copy of the full training data before
            # instance selection would shrink it
            self.train_X_all = copy.deepcopy(
                self.dataset.splitted_values['train'])
            self.train_y_all = copy.deepcopy(
                self.dataset.splitted_target['train'])

            #self.dataset.splitted_values['train'], self.dataset.splitted_target['train'] = sample_data_by_cnn(self.dataset.splitted_values['train'], self.dataset.splitted_target['train'])
            print("training:" + str(len(self.dataset.splitted_target['train'])))
        else:
            self.train_X_all = self.dataset.splitted_values['train']
            self.train_y_all = self.dataset.splitted_target['train']

    #rank and select features
    def random_select(self, k: int):
        """Return k candidate indices chosen uniformly at random."""
        arr = np.arange(len(self.candidates))
        np.random.shuffle(arr)
        return arr[0:k]

    def generate_target(self):
        """Encode the target (classification only) and build CV fold indices.

        Populates: self.test_target, self.train_y_all_target,
        self.preprocessed_folds, self.target_train_folds,
        self.target_test_folds. Requires self.folds to be set beforehand.
        """
        current_target = self.dataset.splitted_target['train']

        if isinstance(self.classifier(), ClassifierMixin):
            label_encoder = LabelEncoder()
            label_encoder.fit(current_target)
            current_target = label_encoder.transform(current_target)

            self.test_target = None
            self.train_y_all_target = None
            if Config.get_default('score.test', 'False') == 'True':
                self.test_target = label_encoder.transform(
                    self.dataset.splitted_target['test'])
                self.train_y_all_target = label_encoder.transform(
                    self.train_y_all)

            self.preprocessed_folds = []
            # NOTE(review): random_state without shuffle=True is rejected by
            # newer scikit-learn; assumes the pinned project version.
            for train, test in StratifiedKFold(
                    n_splits=self.folds, random_state=42).split(
                        self.dataset.splitted_values['train'], current_target):
                self.preprocessed_folds.append((train, test))
        elif isinstance(self.classifier(), RegressorMixin):
            # regression targets are used as-is (no label encoding)
            if Config.get_default('score.test', 'False') == 'True':
                self.test_target = self.dataset.splitted_target['test']
                self.train_y_all_target = self.train_y_all

            self.preprocessed_folds = []
            for train, test in KFold(n_splits=self.folds,
                                     random_state=42).split(
                                         self.dataset.splitted_values['train'],
                                         current_target):
                self.preprocessed_folds.append((train, test))
        else:
            pass

        # per-fold target slices, indexed by fold number
        self.target_train_folds = [None] * self.folds
        self.target_test_folds = [None] * self.folds

        for fold in range(len(self.preprocessed_folds)):
            self.target_train_folds[fold] = current_target[
                self.preprocessed_folds[fold][0]]
            self.target_test_folds[fold] = current_target[
                self.preprocessed_folds[fold][1]]

    '''
    def evaluate_candidates(self, candidates: List[CandidateFeature]) -> List[CandidateFeature]:
        pool = mp.Pool(processes=int(Config.get_default("parallelism", mp.cpu_count())))
        my_function = partial(evaluate, classifier=self.classifier,
                              grid_search_parameters=self.grid_search_parameters,
                              preprocessed_folds=self.preprocessed_folds,
                              score=self.score,
                              train_data=self.dataset.splitted_values['train'],
                              current_target=self.current_target,
                              train_X_all=self.train_X_all,
                              train_y_all_target=self.train_y_all_target,
                              test_data=self.dataset.splitted_values['test'],
                              test_target=self.test_target)

        if Config.get_default("show_progess", 'True') == 'True':
            results = []
            for x in tqdm.tqdm(pool.imap_unordered(my_function, candidates), total=len(candidates)):
                results.append(x)
        else:
            results = pool.map(my_function, candidates)

        return results
    '''

    def evaluate_candidates(self, candidates: List[CandidateFeature],
                            my_folds) -> List[CandidateFeature]:
        """Sequentially evaluate candidates via the module-level `evaluate`."""
        my_function = partial(
            evaluate,
            classifier=self.classifier,
            grid_search_parameters=self.grid_search_parameters,
            preprocessed_folds=my_folds,
            score=self.score,
            train_data=self.dataset.splitted_values['train'],
            current_target=self.train_y_all_target,
            train_X_all=self.train_X_all,
            train_y_all_target=self.train_y_all_target,
            test_data=self.dataset.splitted_values['test'],
            test_target=self.test_target)
        results = []
        for can in candidates:
            results.append(my_function(can))
        return results

    def evaluate_candidates_detail(self, candidates: List[CandidateFeature],
                                   my_folds,
                                   cv_jobs) -> List[CandidateFeature]:
        """Like evaluate_candidates, but forwards cv_jobs to `evaluate`."""
        my_function = partial(
            evaluate,
            classifier=self.classifier,
            grid_search_parameters=self.grid_search_parameters,
            preprocessed_folds=my_folds,
            score=self.score,
            train_data=self.dataset.splitted_values['train'],
            current_target=self.train_y_all_target,
            train_X_all=self.train_X_all,
            train_y_all_target=self.train_y_all_target,
            test_data=self.dataset.splitted_values['test'],
            test_target=self.test_target,
            cv_jobs=cv_jobs)
        results = []
        for can in candidates:
            results.append(my_function(can))
        return results

    def evaluate_candidates_randomcv(self, candidates: List[CandidateFeature],
                                     my_folds,
                                     cv_jobs) -> List[CandidateFeature]:
        """Same pattern, dispatching to `evaluate_randomcv` instead."""
        my_function = partial(
            evaluate_randomcv,
            classifier=self.classifier,
            grid_search_parameters=self.grid_search_parameters,
            preprocessed_folds=my_folds,
            score=self.score,
            train_data=self.dataset.splitted_values['train'],
            current_target=self.train_y_all_target,
            train_X_all=self.train_X_all,
            train_y_all_target=self.train_y_all_target,
            test_data=self.dataset.splitted_values['test'],
            test_target=self.test_target,
            cv_jobs=cv_jobs)
        results = []
        for can in candidates:
            results.append(my_function(can))
        return results

    # NOTE(review): this trailing triple-quote opens a string literal that is
    # never closed within this excerpt — the original file presumably
    # comments out further code beyond this chunk. Kept verbatim.
    '''
class ExploreKitSelection_iterative_search:
    """Variant of the iterative search that fits a fixed logistic-regression
    pipeline on one stratified split and collects class-probability
    estimates per candidate.

    NOTE(review): relies on project types (Splitter, Reader, Generator,
    CandidateFeature, RawFeature, Config) defined elsewhere.
    """

    def __init__(self, dataset_config, classifier=None,
                 grid_search_parameters=None):
        """Store experiment configuration.

        Previously the defaults were a shared estimator instance and a
        shared dict (mutable default arguments); both are now created per
        instance.
        """
        self.dataset_config = dataset_config
        self.classifier = classifier if classifier is not None else LogisticRegression()
        if grid_search_parameters is None:
            grid_search_parameters = {
                'classifier__penalty': ['l2'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'classifier__solver': ['lbfgs'],
                'classifier__class_weight': ['balanced'],
                'classifier__max_iter': [10000]
            }
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):
        """Read/split the dataset and generate every candidate feature."""
        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)
        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], s)
        raw_features = self.dataset.read()
        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))

    #rank and select features
    def random_select(self, k: int):
        """Return k candidate indices chosen uniformly at random."""
        arr = np.arange(len(self.candidates))
        np.random.shuffle(arr)
        return arr[0:k]

    def generate_target(self):
        """Label-encode the training target into self.current_target."""
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self, candidate, score=None, folds=10):
        """Fit a fixed pipeline on self.train and return predict_proba on self.test.

        `score`/`folds` are unused here; kept for signature compatibility
        with the sibling experiment classes (`score` defaults lazily to
        make_scorer(roc_auc_score, average='micro')). Requires run() to have
        set self.train / self.test first.
        """
        if score is None:
            score = make_scorer(roc_auc_score, average='micro')
        pipeline = Pipeline([('feature',
                              FeatureUnion([(candidate.get_name(),
                                             candidate.pipeline)])),
                             ('classifier',
                              LogisticRegression(penalty='l2',
                                                 solver='lbfgs',
                                                 class_weight='balanced'))])
        result = {}
        pipeline.fit(self.dataset.splitted_values['train'][self.train],
                     self.current_target[self.train])
        result['probability_estimations_test'] = pipeline.predict_proba(
            self.dataset.splitted_values['train'][self.test])
        return result

    def create_starting_features(self):
        """Materialize every raw feature into one dense training matrix."""
        Fi: List[RawFeature] = self.dataset.raw_features

        #materialize and numpyfy the features
        starting_feature_matrix = np.zeros(
            (Fi[0].materialize()['train'].shape[0], len(Fi)))
        for f_index in range(len(Fi)):
            starting_feature_matrix[:, f_index] = Fi[f_index].materialize()['train']
        return starting_feature_matrix

    def evaluate_candidates(self, candidates):
        """Evaluate all candidates in parallel."""
        # context manager terminates the worker pool (was leaked before)
        with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
            results = pool.map(self.evaluate_single_candidate, candidates)
        return results

    def evaluate_single_candidate(self, candidate):
        """Evaluate one candidate; on failure record score -1.0 and keep going."""
        result = {}
        time_start_gs = time.time()
        try:
            result = self.evaluate(candidate)
            #print("feature: " + str(candidate) + " -> " + str(new_score))
        except Exception as e:
            print(str(candidate) + " -> " + str(e))
            result['score'] = -1.0
            result['hyperparameters'] = {}
        result['candidate'] = candidate
        result['time'] = time.time() - time_start_gs
        return result

    def filter_failing_features(self):
        """Keep only candidates whose fit/transform succeeds on the train split."""
        working_features: List[CandidateFeature] = []
        for candidate in self.candidates:
            try:
                candidate.fit(self.dataset.splitted_values['train'])
                candidate.transform(self.dataset.splitted_values['train'])
            except Exception:
                # was a bare `except:`, which also swallowed KeyboardInterrupt
                continue
            working_features.append(candidate)
        return working_features

    def run(self):
        """Generate candidates, build one stratified split, evaluate all."""
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        # NOTE(review): random_state without shuffle=True is ignored (and an
        # error in newer scikit-learn); assumes the pinned project version.
        stratifier = StratifiedKFold(n_splits=2, random_state=42)
        self.train, self.test = next(
            stratifier.split(self.dataset.splitted_values['train'],
                             self.current_target))

        results = self.evaluate_candidates(self.candidates)
        return results
        # NOTE(review): the next two statements are the tail of a method
        # (printing and returning `all_representations`) whose `def` lies
        # before this excerpt; kept verbatim at method indentation.
        print(len(all_representations))
        return all_representations


if __name__ == '__main__':
    from fastsklearnfeature.splitting.Splitter import Splitter
    import time

    s = Splitter(train_fraction=[0.6, 10000000])

    # dataset = (csv path, target column index); alternatives kept below
    dataset = (Config.get('statlog_heart.csv'),
               int(Config.get('statlog_heart.target')))
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpOJxGL9_indianliver.csv", 10)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv", 20)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv", 9)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpn1jVwe_mammography.csv", 6)

    r = Reader(dataset[0], dataset[1], s)
    raw_features = r.read()

    g = TreeGenerator(raw_features)
    start_time = time.time()
    g.generate_candidates()