def ReprodIndividualsFromRF(list_indiv, max_id, options):
    list_indiv = list(list_indiv)
    rf = RandomForestClassifier(n_estimators=len(list_indiv))
    trees = list()
    for indiv in list_indiv:
        trees.append(indiv.clf)
    rf.estimators_ = trees
    rf.n_classes_ = trees[0].n_classes_
    rf.classes_ = trees[0].classes_
    new_dt = eqtree_rec_rf(rf, 0, max_depth=options['max_depth'], smallest_tree=False)
    new_id = max_id + 1
    indiv3 = genetic.individual(new_dt, new_id, type_rf=False, alpha=options['alpha'],
                                evaluate_on_data=options['on_data'], X=options['X'], y=options['y'])
    return indiv3
def deserialize_random_forest(model_dict):
    model = RandomForestClassifier(**model_dict['params'])
    estimators = [deserialize_decision_tree(decision_tree) for decision_tree in model_dict['estimators_']]
    model.estimators_ = np.array(estimators)

    model.classes_ = np.array(model_dict['classes_'])
    model.n_features_ = model_dict['n_features_']
    model.n_outputs_ = model_dict['n_outputs_']
    model.max_depth = model_dict['max_depth']
    model.min_samples_split = model_dict['min_samples_split']
    model.min_samples_leaf = model_dict['min_samples_leaf']
    model.min_weight_fraction_leaf = model_dict['min_weight_fraction_leaf']
    model.max_features = model_dict['max_features']
    model.max_leaf_nodes = model_dict['max_leaf_nodes']
    model.min_impurity_decrease = model_dict['min_impurity_decrease']
    model.min_impurity_split = model_dict['min_impurity_split']

    if 'oob_score_' in model_dict:
        model.oob_score_ = model_dict['oob_score_']
    if 'oob_decision_function_' in model_dict:
        model.oob_decision_function_ = model_dict['oob_decision_function_']

    if isinstance(model_dict['n_classes_'], list):
        model.n_classes_ = np.array(model_dict['n_classes_'])
    else:
        model.n_classes_ = model_dict['n_classes_']

    return model
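# For context, a hedged sketch (not part of the original snippet) of the dictionary
# shape deserialize_random_forest expects. The exact structure depends on the matching
# serializer (e.g. a serialize_random_forest counterpart, not shown here); the keys and
# placeholder values below simply mirror what the function above reads.
example_model_dict = {
    'params': {'n_estimators': 100},  # constructor kwargs for RandomForestClassifier(**params)
    'estimators_': [],                # one serialized decision tree per estimator (format not shown)
    'classes_': [0, 1],
    'n_classes_': 2,
    'n_features_': 4,
    'n_outputs_': 1,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_impurity_split': None,
    # optional keys: 'oob_score_', 'oob_decision_function_'
}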
def build_classifier(trees):
    def build_decision_tree(t):
        dt = DecisionTreeClassifier(random_state=0)
        dt.n_features_ = t.n_features
        dt.n_outputs_ = t.n_outputs
        dt.n_classes_ = t.n_classes[0]
        dt.classes_ = np.array([x for x in range(dt.n_classes_)])
        dt.tree_ = t
        return dt

    if len(trees) > 1:
        clf = RandomForestClassifier(random_state=0, n_estimators=len(trees))
        clf.estimators_ = [build_decision_tree(t) for t in trees]
        clf.n_features_ = trees[0].n_features
        clf.n_outputs_ = trees[0].n_outputs
        clf.n_classes_ = trees[0].n_classes[0]
        clf.classes_ = np.array([x for x in range(clf.n_classes_)])
    else:
        clf = build_decision_tree(trees[0])
    return clf
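# Minimal usage sketch (not part of the original snippet): rebuild an ensemble from the
# low-level Tree objects of an already fitted forest. The iris data and variable names
# are purely illustrative assumptions.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
fitted = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

raw_trees = [est.tree_ for est in fitted.estimators_]  # sklearn's internal Tree objects
rebuilt = build_classifier(raw_trees)

# Depending on the scikit-learn version, predict() may also validate n_features_in_
# on the rebuilt forest and its per-tree estimators, so copy it over defensively.
rebuilt.n_features_in_ = fitted.n_features_in_
for est in rebuilt.estimators_:
    est.n_features_in_ = fitted.n_features_in_

print(rebuilt.predict(X[:5]))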
# Return an array of feature importances (the higher, the more important the feature).
## forest.feature_importances_
print("Feature Importances")
print(forest.feature_importances_)

# Score of the training dataset obtained using an out-of-bag estimate
## forest.oob_score_
print("Oob Score = " + str(forest.oob_score_))

# Decision function computed with the out-of-bag estimate on the training set.
# If n_estimators is small, it is possible that a data point was never left out
# during the bootstrap. In that case, oob_decision_function_ may contain NaN.
## forest.oob_decision_function_
print('Oob Decision Function')
print(forest.oob_decision_function_)

"""
@Method
"""
# Returns the mean accuracy on the given test data and labels.
# In multi-label classification, this is the subset accuracy, which is a harsh metric,
# since it requires that each label set be predicted correctly for every sample.
fscore = forest.score(X_train, y_train)
print('Score')
print(fscore)

# Remove the last estimator from the ensemble; pop() returns the removed tree,
# so its result must not be assigned back to estimators_.
forest.estimators_.pop()
print(len(forest.estimators_))
# Original random forest
RF = RandomForestClassifier(n_estimators=rfSize)
RF.fit(train_x, train_y)
RF_path = model_path + '/RF.m'
joblib.dump(RF, RF_path)

# BRAF (rf1 and rf2 are assumed to have been trained earlier, outside this snippet)
rf3 = RandomForestClassifier(n_estimators=rf2_size)
rf3.fit(training_c_x, training_c_y)
rf3_path = model_path + '/rf3.m'
joblib.dump(rf3, rf3_path)

RF1 = RandomForestClassifier(n_estimators=rfSize)
Gobaltree = rf1.estimators_ + rf3.estimators_
RF1.estimators_ = Gobaltree
RF1.classes_ = rf1.classes_
RF1.n_classes_ = rf1.n_classes_
RF1.n_outputs_ = rf1.n_outputs_
RF1_path = model_path + '/braf.m'
joblib.dump(RF1, RF1_path)

# DBRF
RF2 = RandomForestClassifier(n_estimators=rfSize)
mod_Gobaltree = rf1.estimators_ + rf2.estimators_
RF2.estimators_ = mod_Gobaltree
RF2.classes_ = rf2.classes_
RF2.n_classes_ = rf2.n_classes_
RF2.n_outputs_ = rf2.n_outputs_
RF2_path = model_path + '/borderlindbscan.m'
joblib.dump(RF2, RF2_path)
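# A minimal, self-contained sketch (not part of the original snippet) of the merge
# pattern used above: two fitted forests are combined by concatenating their
# estimators_ and copying the class metadata. Data comes from make_classification and
# all variable names here are illustrative assumptions only.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
forest_a = RandomForestClassifier(n_estimators=20, random_state=1).fit(X, y)
forest_b = RandomForestClassifier(n_estimators=30, random_state=2).fit(X, y)

merged = RandomForestClassifier(n_estimators=50)
merged.estimators_ = forest_a.estimators_ + forest_b.estimators_
merged.classes_ = forest_a.classes_
merged.n_classes_ = forest_a.n_classes_
merged.n_outputs_ = forest_a.n_outputs_
merged.n_features_in_ = forest_a.n_features_in_  # may be required on newer scikit-learn

print(merged.predict(X[:5]))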
def app_flow(self):
    # This method contains a state machine for the slave and master instance
    # === States ===
    state_initializing = 1
    state_read_input = 2
    state_share_samples = 3
    state_gather_1 = 4
    state_wait_1 = 5
    state_train_local = 6
    state_gather_2 = 7
    state_wait_2 = 8
    state_global_ready = 9
    state_finishing = 10

    # Initial state
    state = state_initializing
    self.progress = 'initializing...'

    while True:
        if state == state_initializing:
            if self.id is not None:  # Test if setup has happened already
                state = state_read_input

        # COMMON PART
        if state == state_read_input:
            print('Reading input...')
            base_dir = os.path.normpath(os.path.join(f'/mnt/input/', self.split_dir))

            def read_input_train(ins, path):
                d = pd.read_csv(path, sep=self.sep)
                data_X = d.drop(self.label, axis=1)
                data_y = d[self.label]
                if ins.split_test is not None:
                    ins.data = pd.read_csv(os.path.join(base_dir, ins.input_train), sep=ins.sep)
                    data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(data_X, data_y, test_size=ins.split_test)
                    ins.data_X_train.append(data_X_train)
                    ins.data_y_train.append(data_y_train)
                    ins.data_X_test.append(data_X_test)
                    ins.data_y_test.append(data_y_test)
                else:
                    ins.data_X_train.append(data_X)
                    ins.data_y_train.append(data_y)

            def read_input_test(ins, path):
                d = pd.read_csv(path, sep=ins.sep)
                data_X = d.drop(ins.label, axis=1)
                data_y = d[ins.label]
                ins.data_X_test.append(data_X)
                ins.data_y_test.append(data_y)

            if self.split_mode == 'directory':
                for split_name in os.listdir(base_dir):
                    read_input_train(self, os.path.join(base_dir, split_name, self.input_train))
                    if self.input_test is not None:
                        read_input_test(self, os.path.join(base_dir, split_name, self.input_test))
            elif self.split_mode == 'file':
                read_input_train(self, os.path.join(base_dir, self.input_train))
                if self.input_test is not None:
                    read_input_test(self, os.path.join(base_dir, self.input_test))

            split_samples = [i.shape[0] for i in self.data_y_train]
            self.my_samples = sum(split_samples) // len(split_samples)
            print(f'Read input. Have {split_samples} samples.')

            if self.master:
                self.data_incoming.append(pickle.dumps({'samples': self.my_samples}))
                state = state_gather_1
            else:
                self.data_outgoing = pickle.dumps({'samples': self.my_samples})
                self.status_available = True
                state = state_wait_1

        if state == state_train_local:
            print('Calculate local values...')
            rfs = []
            for i in range(len(self.data_X_train)):
                global_rf = None
                trees = int(self.estimators_total * self.my_samples / self.total_samples)
                if self.mode == 'classification':
                    global_rf = RandomForestClassifier(n_estimators=trees, random_state=self.random_state)
                elif self.mode == 'regression':
                    global_rf = RandomForestRegressor(n_estimators=trees, random_state=self.random_state)
                global_rf.fit(self.data_X_train[i], self.data_y_train[i])
                rfs.append({'rf': global_rf})
            print(f'Trained random forests')

            if self.master:
                self.data_incoming.append(pickle.dumps(rfs))
                state = state_gather_2
            else:
                self.data_outgoing = pickle.dumps(rfs)
                self.status_available = True
                state = state_wait_2

        if state == state_global_ready:
            print(f'Forest done')
            results_pred = []
            results_proba = []
            results_test = []
            for i in range(len(self.data_X_train)):
                results_pred.append(self.rfs[i].predict(self.data_X_test[i]))
                if self.mode == 'classification':
                    results_proba.append(self.rfs[i].predict_proba(self.data_X_test[i]))
                results_test.append(self.data_y_test[i])

            def write_output(path, data):
                df = pd.DataFrame(data=data)
                df.to_csv(path, index=False, sep=self.sep)

            print(f'Writing output')
            base_dir_in = os.path.normpath(os.path.join(f'/mnt/input/', self.split_dir))
            base_dir_out = os.path.normpath(os.path.join(f'/mnt/output/', self.split_dir))
            if self.split_mode == 'directory':
                for i, split_name in enumerate(os.listdir(base_dir_in)):
                    write_output(os.path.join(base_dir_out, split_name, self.output_pred), {'pred': results_pred[i][:]})
                    if self.mode == 'classification':
                        write_output(os.path.join(base_dir_out, split_name, self.output_proba), {'prob_0': results_proba[i][:, 0], 'prob_1': results_proba[i][:, 1]})
                    write_output(os.path.join(base_dir_out, split_name, self.output_test), {'y_true': results_test[i]})
            elif self.split_mode == 'file':
                write_output(os.path.join(base_dir_out, self.output_pred), {'pred': results_pred[0][:]})
                if self.mode == 'classification':
                    write_output(os.path.join(base_dir_out, self.output_proba), {'prob_0': results_proba[0][:, 0], 'prob_1': results_proba[0][:, 1]})
                write_output(os.path.join(base_dir_out, self.output_test), {'y_true': results_test[0]})

            if self.master:
                self.data_incoming.append('DONE')
                state = state_finishing
            else:
                self.data_outgoing = 'DONE'
                self.status_available = True
                break

        # GLOBAL PART
        if state == state_gather_1:
            if len(self.data_incoming) == len(self.clients):
                client_data = []
                for local_rfs in self.data_incoming:
                    client_data.append(pickle.loads(local_rfs))
                self.data_incoming = []
                total_samples = sum([cd['samples'] for cd in client_data])
                self.total_samples = total_samples
                self.data_outgoing = pickle.dumps(total_samples)
                self.status_available = True
                state = state_train_local
            else:
                print(f'Have {len(self.data_incoming)} of {len(self.clients)} so far, waiting...')

        if state == state_gather_2:
            if len(self.data_incoming) == len(self.clients):
                client_data = []
                for local_rfs in self.data_incoming:
                    client_data.append(pickle.loads(local_rfs))
                self.data_incoming = []

                data_outgoing = []
                for i in range(len(self.data_X_train)):
                    global_rf = None
                    # total_samples = 0
                    # for d in client_data:
                    #     total_samples += d[i]['samples']
                    for d in client_data:
                        drf = d[i]['rf']
                        # perc = d[i]['samples'] / total_samples
                        # trees = int(perc * self.estimators_total)
                        if global_rf is None:
                            global_rf = drf
                            global_rf.estimators_ = drf.estimators_
                            # global_rf.estimators_ = random.sample(drf.estimators_, trees)
                            global_rf.n_estimators = drf.n_estimators
                        else:
                            global_rf.estimators_ += drf.estimators_
                            # global_rf.estimators_ += random.sample(drf.estimators_, trees)
                            global_rf.n_estimators += drf.n_estimators
                    data_outgoing.append(global_rf)

                self.rfs = data_outgoing
                self.data_outgoing = pickle.dumps(data_outgoing)
                self.status_available = True
                state = state_global_ready
            else:
                print(f'Have {len(self.data_incoming)} of {len(self.clients)} so far, waiting...')

        if state == state_finishing:
            if len(self.data_incoming) == len(self.clients):
                self.status_finished = True
                break

        # LOCAL PART
        if state == state_wait_1:
            if len(self.data_incoming) > 0:
                self.total_samples = pickle.loads(self.data_incoming[0])
                self.data_incoming = []
                state = state_train_local

        if state == state_wait_2:
            if len(self.data_incoming) > 0:
                self.rfs = pickle.loads(self.data_incoming[0])
                self.data_incoming = []
                state = state_global_ready

        time.sleep(1)
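# A hedged sketch (not part of the original app) of the commented-out alternative in the
# gather_2 state above: instead of keeping every client tree, subsample each client's
# forest in proportion to its sample count so the merged forest stays near a target size.
# merge_proportional and all parameter names are illustrative assumptions that mirror
# the variables used in app_flow.
import random

def merge_proportional(client_forests, client_samples, estimators_total):
    total = sum(client_samples)
    merged = None
    for rf, n in zip(client_forests, client_samples):
        # Number of trees this client contributes, proportional to its data share
        k = max(1, int(estimators_total * n / total))
        picked = random.sample(rf.estimators_, min(k, len(rf.estimators_)))
        if merged is None:
            merged = rf  # reuse the first client's forest as the container, as app_flow does
            merged.estimators_ = picked
            merged.n_estimators = len(picked)
        else:
            merged.estimators_ += picked
            merged.n_estimators += len(picked)
    return merged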
local_dict.append(w)
local_dict = sorted(local_dict)

############################# Part 3: Linearity ##########################

# random forest 1
rf1 = RandomForestClassifier(random_state=10)
rf1.fit(train_vectors, y_train)

# random forest 2
rf2 = RandomForestClassifier(random_state=15)
rf2.fit(train_vectors, y_train)

# random forest 3
rf3 = RandomForestClassifier(random_state=22)
rf3.estimators_ = rf1.estimators_ + rf2.estimators_
rf3.n_classes_ = rf1.n_classes_

# model 1
def model_rf1(data):
    n_data = len(data)
    res = np.zeros((n_data, 2))
    tfidf = vectorizer.transform(data)
    p = rf1.predict_proba(tfidf)
    res[:, 0] = p[:, 1]
    res[:, 1] = p[:, 1]
    return res

# model 2
def model_rf2(data):
    n_data = len(data)