def test_multi_output_exceptions():
    # NotFittedError when fit is not done but score, predict and
    # predict_proba are called
    moc = MultiOutputClassifier(LinearSVC(random_state=0))

    with pytest.raises(NotFittedError):
        moc.predict(y)

    with pytest.raises(NotFittedError):
        # bare attribute access is intentional: predict_proba is exposed
        # as a property here, and accessing it before fit raises
        moc.predict_proba

    with pytest.raises(NotFittedError):
        moc.score(X, y)

    # ValueError when number of outputs is different
    # for fit and score
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    with pytest.raises(ValueError):
        moc.score(X, y_new)

    # ValueError when y is continuous
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        moc.fit(X, X[:, 1])
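# A sketch of the module-level fixtures this test assumes; the real
# sklearn test module defines its own X, y, y1, y2, so the data below is
# illustrative only.
import numpy as np
import pytest
from sklearn import datasets
from sklearn.exceptions import NotFittedError
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle

X, y1 = datasets.make_classification(n_samples=100, n_features=20,
                                     n_informative=5, n_classes=3,
                                     random_state=0)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
y = np.column_stack((y1, y2, y3))  # three outputs, so the two-column y_new mismatches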
def obtain_optimum_parameters(hidden_layers, neurons_per_layer, times=3):
    """Compute the accuracy of several multilayer perceptrons.

    A separate model is fitted for each combination of hidden-layer count
    and neurons-per-layer count. Every layer of a given MLP has the same
    number of neurons. The results are stored in a JSON file.

    :param hidden_layers: list of hidden-layer counts
    :param neurons_per_layer: list of neurons-per-layer counts
    :param times: number of times each MLP is run
    """
    logging.info("Computing the optimal number of hidden layers and neurons...")
    # DataFrame that will hold the results
    df = pd.DataFrame(index=neurons_per_layer, columns=hidden_layers)
    for layers in hidden_layers:
        logging.info("MLPs with {} layers ...".format(layers))
        for neurons in neurons_per_layer:
            hidden_layer_sizes = [neurons for _ in range(layers)]
            logging.info(
                'Training MLP with {} layers and {} neurons per hidden layer'
                .format(layers, neurons))
            # Train the model and average the score over several runs
            score = 0
            for _ in range(times):
                mlp = MLPClassifier(activation='logistic',
                                    solver='lbfgs',
                                    hidden_layer_sizes=hidden_layer_sizes,
                                    max_iter=1000)
                multi_mlp = MultiOutputClassifier(mlp, n_jobs=None)
                multi_mlp.fit(
                    train_tfidf,
                    train_data[['arriendo', 'profesional', 'tipo_construccion_id']])
                score += multi_mlp.score(
                    test_tfidf,
                    test_data[['arriendo', 'profesional', 'tipo_construccion_id']])
                logging.info("Accumulated score: {}".format(score))
            df.loc[neurons, layers] = score / times
            print('Accuracy: {}'.format(df.loc[neurons, layers]))
    # Store the DataFrame in a JSON file
    df.to_json('neurons_per_layer.json', orient='columns')
    # Plot
    df.rename(lambda x: '{} hidden layers'.format(x), axis='columns', inplace=True)
    df.plot.line(marker='o')
    plt.xlabel("neurons per hidden layer")
    plt.ylabel("accuracy")
    plt.show()
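# The nested loops above can also be expressed as a GridSearchCV sweep --
# an alternative sketch, not the original code; train_tfidf, train_data
# and the three target columns are assumed to exist as above.
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier

param_grid = {
    # nested parameters of the wrapped MLP are addressed via estimator__
    'estimator__hidden_layer_sizes': [(n,) * layers
                                      for layers in (1, 2, 3)
                                      for n in (10, 50, 100)],
}
search = GridSearchCV(
    MultiOutputClassifier(MLPClassifier(activation='logistic',
                                        solver='lbfgs', max_iter=1000)),
    param_grid, cv=3)
search.fit(train_tfidf,
           train_data[['arriendo', 'profesional', 'tipo_construccion_id']])
print(search.best_params_, search.best_score_)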
def logistic_regression(
    dataframe: pandas.DataFrame,
    inputs: Iterable[str],
    outputs: Iterable[str],
) -> LogisticRegressionResult:
    """Extract the dataset variable columns and fit them with a logistic
    regression model.

    Arguments:
        dataframe: the dataset to run the logistic regression against
        inputs: list of input variable names
        outputs: list of output variable names

    Returns:
        A result that contains the values of the coefficients and intercepts
    """
    x = dataframe[inputs]
    y = dataframe[outputs].copy()
    encoders = [LabelEncoder() for _ in outputs]
    for encoder, output in zip(encoders, outputs):
        y[output] = encoder.fit_transform(y[output])

    reg = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(x, y)  # type: ignore
    outcomes = [encoder.classes_.tolist() for encoder in encoders]
    coefs = [estimator.coef_.tolist() for estimator in reg.estimators_]
    intercepts = [estimator.intercept_.tolist() for estimator in reg.estimators_]
    score = float(reg.score(x, y))
    return LogisticRegressionResult(
        outcomes=outcomes,
        coefs=coefs,
        intercepts=intercepts,
        score=score,
    )
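# A hypothetical call, assuming a small survey-style DataFrame; the
# column names are illustrative only.
import pandas

df = pandas.DataFrame({
    'age': [23, 45, 31, 52],
    'income': [30000, 80000, 52000, 91000],
    'owns_home': ['no', 'yes', 'no', 'yes'],
    'has_car': ['yes', 'yes', 'no', 'yes'],
})
result = logistic_regression(df, inputs=['age', 'income'],
                             outputs=['owns_home', 'has_car'])
print(result.score, result.outcomes)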
class GOClassifier:
    def __init__(self, X, y, random_seed=11, test_size=0.25, *args, **kwargs):
        ind = np.arange(X.shape[0])
        np.random.seed(random_seed)
        np.random.shuffle(ind)
        self.X = X[ind]
        self.y = y[ind]
        # split the shuffled copies (the original split the unshuffled
        # X and y, which made the shuffle above a no-op)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_seed)
        self.random_seed = random_seed
        self.args = args
        self.kwargs = kwargs
        self.clf = None

    def fit(self, X=None, y=None):
        X_ = X if X is not None else self.X_train
        y_ = y if y is not None else self.y_train
        self.clf = MultiOutputClassifier(
            SGDClassifier(alpha=0.0001,
                          max_iter=1000,
                          tol=1e-3,
                          random_state=self.random_seed,
                          *self.args,
                          **self.kwargs))
        self.clf.fit(X_, y_)
        return self.clf

    def predict(self, X=None):
        assert self.clf is not None
        X_ = X if X is not None else self.X
        return self.clf.predict(X_)

    def test_predict(self):
        return self.predict(X=self.X_test)

    def score(self, X, y):
        assert self.clf is not None
        return self.clf.score(X, y)

    def test_score(self):
        assert self.clf is not None
        return self.clf.score(self.X_test, self.y_test)

    def train_score(self):
        assert self.clf is not None
        return self.clf.score(self.X_train, self.y_train)
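# A minimal usage sketch with synthetic multilabel data; shapes and
# parameters are illustrative.
import numpy as np
from sklearn.datasets import make_multilabel_classification

X, y = make_multilabel_classification(n_samples=200, n_features=10,
                                      n_classes=3, random_state=0)
go = GOClassifier(X, y)
go.fit()
print('train:', go.train_score(), 'test:', go.test_score())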
def SVM(self):
    """Fit a multi-output SVM and report its test accuracy."""
    # SVC is sklearn's RBF-kernel classifier; the original wrapped the
    # method itself (SVM), which would recurse instead of building a model
    class_multisvm = MultiOutputClassifier(SVC(kernel='rbf'))
    # Fit on the train data
    class_multisvm.fit(self.X_train, self.y_train)
    # Check the prediction score
    score = class_multisvm.score(self.X_test, self.y_test)
    print("The prediction score on the test data is {:.2f}%".format(score * 100))
def RF(self):
    class_multirf = MultiOutputClassifier(
        RandomForestClassifier(max_depth=30, random_state=0))
    # Fit on the train data
    class_multirf.fit(self.X_train, self.y_train)
    # Check the prediction score
    score = class_multirf.score(self.X_test, self.y_test)
    print("The prediction score on the test data is {:.2f}%".format(score * 100))
def multioutput_classifier(self, dt_final, x_train1, x_test1, y_train1, y_test1):
    model = MultiOutputClassifier(dt_final)
    model.fit(x_train1, y_train1)  # training the model; this can take a little time
    accuracy = model.score(x_test1, y_test1)  # evaluate on the held-out test set
    data = {
        'Accuracy': [accuracy],
        'Algorithm': ['DecisionTreeClassifier']
    }
    algorithm_output = pd.DataFrame(data)
    print("===============================")
    print("Multi-label accuracy on the test set:")
    print(algorithm_output)
    return model
def KNN(X_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier(algorithm='auto',
                               metric='minkowski',
                               metric_params=None,
                               n_jobs=-1,
                               n_neighbors=147,
                               p=2,
                               weights='distance')
    knn.fit(X_train, y_train)  # redundant: MultiOutputClassifier clones and refits the estimator
    classifier = MultiOutputClassifier(knn, n_jobs=-1)
    classifier.fit(X_train, y_train)
    y_predict = classifier.predict_proba(x_test)
    # predict_proba returns one (n_samples, 2) array per label; collect
    # the positive-class probability for each of the 147 labels over the
    # 1967 test samples into a single matrix
    output = np.zeros((1967, 147))  # 2597
    for x in range(1967):
        for y in range(147):
            output[x][y] = y_predict[y][x][1]
    # print(output)
    # np.savetxt("sub.csv", output, delimiter=",")
    # score() expects the raw features, not the probability matrix
    print(classifier.score(x_test, y_test))
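# The double loop above can be collapsed into one np.stack call; a sketch
# assuming y_predict is the list of per-label (n_samples, 2) arrays that
# MultiOutputClassifier.predict_proba returns:
import numpy as np

output = np.stack([proba[:, 1] for proba in y_predict], axis=1)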
x = np.load('./data/x_data.npy')
y = np.load('./data/y_data.npy')
x_pred = np.load('./data/x_pred.npy')

print("x.shape :", x.shape)
print("y.shape :", y.shape)
print("x_pred.shape :", x_pred.shape)

x = x.reshape(x.shape[0], 64 * 64 * 3)
x_pred = x_pred.reshape(x_pred.shape[0], 64 * 64 * 3)

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    random_state=77,
                                                    shuffle=True)

# model = XGBClassifier()
model = MultiOutputClassifier(XGBRFClassifier())

# 3. Train
model.fit(x_train, y_train)

# 4. Evaluate, predict
acc = model.score(x_test, y_test)
print("acc :", acc)

y_pred = model.predict(x_pred)
########################################
if ML_option == "Logistic Regression":
    # Fit the model and predict X_test. Show some analysis.
    try:
        logReg = MultiOutputClassifier(LogisticRegression())
        logReg.fit(X_train, y_train)
        pred = logReg.predict(X_test)
        st.write('Mean Absolute Error (MAE):',
                 round(metrics.mean_absolute_error(y_test, pred), 4))
        st.write('Mean Squared Error (MSE):',
                 round(metrics.mean_squared_error(y_test, pred), 4))
        st.write('Root Mean Squared Error (RMSE):',
                 round(np.sqrt(metrics.mean_squared_error(y_test, pred)), 4))
        st.write('Accuracy of Logistic Regression on training set: ',
                 round(logReg.score(X_train, y_train), 4))
        st.write('Accuracy of Logistic Regression on test set: ',
                 round(logReg.score(X_test, y_test), 4))
        st.subheader("Classification Report")
        st.text(classification_report(y_test, pred))
        try:
            # Confusion matrix
            plot_confusion_matrix(y_test, pred, figsize=(7, 5), cmap="PuBuGn")
            bottom, top = plt.ylim()
            plt.ylim(bottom + 0.5, top - 0.5)
            st.pyplot()
        except:
            st.write("The confusion matrix does not support multi-output.")
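# Instead of the fallback message, per-output confusion matrices can be
# produced with sklearn's multilabel_confusion_matrix -- a sketch that
# assumes each output column of y_test/pred above is binary:
from sklearn.metrics import multilabel_confusion_matrix

for label_idx, cm in enumerate(multilabel_confusion_matrix(y_test, pred)):
    st.write(f"Confusion matrix for output {label_idx}:", cm)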
from .MailCategorizator import Preprocessor
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import pickle

if __name__ == '__main__':
    preprocessor = Preprocessor()
    binarizer = MultiLabelBinarizer()
    clf = MultiOutputClassifier(RandomForestClassifier(), n_jobs=-1)
    X = preprocessor.build_tfidf_matrix()
    y = binarizer.fit_transform(preprocessor.get_target())
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    clf.fit(x_train, y_train)
    print(clf.score(x_test, y_test))
    # pickle.dump needs a writable file object, not a directory path;
    # the file names below are assumed
    with open('../data/pickles/classifier.pkl', 'wb') as f:
        pickle.dump(clf, f)
    with open('../data/pickles/preprocessor.pkl', 'wb') as f:
        pickle.dump(preprocessor, f)
class Team:
    def __init__(self, team_name, play_by_play_df):
        self.team = team_name
        self.team_df = play_by_play_df[play_by_play_df['posteam'] == self.team]
        self._generate_lists()
        self.valid_play_dict = {
            'Pass': 0,
            'Run': 1,
            'Punt': 2,
            'Field Goal': 3
        }
        self.valid_play_inv_dict = {
            0: 'Pass',
            1: 'Run',
            2: 'Punt',
            3: 'Field Goal'
        }
        self.X = []
        self.Y = []

    def train_classifier(self, debug_classifier=False):
        self._organize_training_data()
        self._generate_random_forest(debug_classifier)

    def _generate_random_forest(self, debug_classifier):
        self.forest = RandomForestClassifier(n_estimators=100, random_state=1)
        self.multi_target_forest = MultiOutputClassifier(self.forest, n_jobs=-1)
        X_train, X_test, Y_train, Y_test = train_test_split(self.X,
                                                            self.Y,
                                                            test_size=0.1,
                                                            random_state=0)
        self.multi_target_forest.fit(X_train, Y_train)
        forests = self.multi_target_forest.estimators_
        forest0_feat = forests[0].feature_importances_.tolist()
        forest1_feat = forests[1].feature_importances_.tolist()
        forest2_feat = forests[2].feature_importances_.tolist()
        forest3_feat = forests[3].feature_importances_.tolist()
        feature_df = pd.DataFrame(
            data={
                'Features': [x for x in range(5)],
                'Forest0': forest0_feat,
                'Forest1': forest1_feat,
                'Forest2': forest2_feat,
                'Forest3': forest3_feat
            })
        if debug_classifier:
            print('Training Score: ',
                  self.multi_target_forest.score(X_train, Y_train))
            print('Test Score: ',
                  self.multi_target_forest.score(X_test, Y_test))
            fig1 = plt.figure()
            ax = fig1.add_subplot(111)
            width = 0.1
            feature_df.Forest0.plot(kind='bar', color='red', ax=ax,
                                    width=width, position=-1)
            feature_df.Forest1.plot(kind='bar', color='green', ax=ax,
                                    width=width, position=0)
            feature_df.Forest2.plot(kind='bar', color='blue', ax=ax,
                                    width=width, position=1)
            feature_df.Forest3.plot(kind='bar', color='yellow', ax=ax,
                                    width=width, position=2)
            ax.set_xticklabels(
                ['Yards to First', 'Down', 'Quarter', 'Yardline', 'Score Diff'],
                rotation=0)
            ax.set_xlabel('Features')
            ax.set_ylabel('Feature Importance')
            ax.set_title('Random Forest - Feature Analysis')
            plt.xlim(-0.5, 4.5)
            plt.legend(['Pass', 'Run', 'Punt', 'Field Goal'])
            plt.show()

    def test_classifier(self, yards_to_go, down, quarter, yard_line, score_diff):
        # predict_proba expects a 2D array, so wrap the single sample
        input_array = np.array(
            [[yards_to_go, down, quarter, yard_line, score_diff]])
        # one (1, 2) array per output; take each output's positive-class
        # probability, then pick the most likely play type
        prediction = self.multi_target_forest.predict_proba(input_array)
        prediction = [p[0][1] for p in prediction]
        return np.argmax(prediction)

    def _generate_lists(self):
        self.play_type = self.team_df['PlayType'].values.tolist()
        self.game_ID = self.team_df['GameID'].values.tolist()
        self.drive = self.team_df['Drive'].values.tolist()
        self.quarter = self.team_df['qtr'].values.tolist()
        self.down = self.team_df['down'].values.tolist()
        self.time = self.team_df['time'].values.tolist()
        self.pos_team = self.team_df['posteam'].values.tolist()
        self.def_team = self.team_df['DefensiveTeam'].values.tolist()
        self.pass_length = self.team_df['PassLength'].values.tolist()
        self.pass_location = self.team_df['PassLocation'].values.tolist()
        self.pass_attempt = self.team_df['PassAttempt'].values.tolist()
        self.air_yards = self.team_df['AirYards'].values.tolist()
        self.rush_attempt = self.team_df['RushAttempt'].values.tolist()
        self.run_location = self.team_df['RunLocation'].values.tolist()
        self.run_gap = self.team_df['RunGap'].values.tolist()
        self.fieldgoal_distance = self.team_df['FieldGoalDistance'].values.tolist()
        self.pos_team_score = self.team_df['PosTeamScore'].values.tolist()
        self.def_team_score = self.team_df['DefTeamScore'].values.tolist()
        self.yrdline100 = self.team_df['yrdline100'].values.tolist()
        self.yrds_to_go = self.team_df['ydstogo'].values.tolist()

    def _organize_training_data(self):
        score_diff_list = np.array(self.pos_team_score) - np.array(self.def_team_score)
        zipped_data = zip(self.quarter, self.down, self.yrdline100,
                          self.yrds_to_go, score_diff_list, self.play_type)
        for quarter, down, yrdln, yrds_to_go, score_diff, play_type in zipped_data:
            input_list = [yrds_to_go, down, quarter, yrdln, score_diff]
            if not np.any(np.isnan(input_list)) and play_type in self.valid_play_dict:
                output_list = [0 for _ in range(4)]
                output_list[self.valid_play_dict[play_type]] = 1
                self.X.append(input_list)
                self.Y.append(output_list)
        self.X = np.array(self.X)
        self.Y = np.array(self.Y)

    def generate_success_probabilities(self, opponent, yr, debug_probs=False):
        ##############################
        # Extract Team Specific Data #
        ##############################
        self.opponent = opponent
        valid_dates = [
            str(yr) + '-' + '09',
            str(yr) + '-' + '10',
            str(yr) + '-' + '11',
            str(yr) + '-' + '12',
            str(yr + 1) + '-' + '01'
        ]
        coach_yr_09_df = self.team_df[self.team_df['\ufeffDate'].str.contains(valid_dates[0])]
        coach_yr_10_df = self.team_df[self.team_df['\ufeffDate'].str.contains(valid_dates[1])]
        coach_yr_11_df = self.team_df[self.team_df['\ufeffDate'].str.contains(valid_dates[2])]
        coach_yr_12_df = self.team_df[self.team_df['\ufeffDate'].str.contains(valid_dates[3])]
        coach_yr_01_df = self.team_df[self.team_df['\ufeffDate'].str.contains(valid_dates[4])]
        coach_yr_df = pd.concat([
            coach_yr_09_df, coach_yr_10_df, coach_yr_11_df, coach_yr_12_df,
            coach_yr_01_df
        ])
        team_prob_df = coach_yr_df[coach_yr_df['DefensiveTeam'] == self.opponent]
        loc_pass_outcome = team_prob_df['PassOutcome'].values.tolist()
        loc_yrds_gained = team_prob_df['Yards.Gained'].values.tolist()
        loc_play_type = team_prob_df['PlayType'].values.tolist()
        loc_interception = team_prob_df['InterceptionThrown'].values.tolist()
        loc_play_type_fumble = coach_yr_df['PlayType'].values.tolist()
        loc_fumble = coach_yr_df['Fumble'].values.tolist()
        loc_drive = coach_yr_df['Drive'].values.tolist()
        loc_gameID = coach_yr_df['GameID'].values.tolist()
        loc_fg_success = coach_yr_df['FieldGoalResult']
        loc_fg_distance = coach_yr_df['yrdline100']
        loc_fg_play_type = coach_yr_df['PlayType']
        loc_punt_spot = coach_yr_df['yrdline100']
        loc_punt_return = coach_yr_df['Return_spot']
        loc_time_elapsed = coach_yr_df['Elapsed_Play_Time']

        ########################
        # Initialize Variables #
        ########################
        self.elapsed_time = {
            'punt': [],
            'run': [],
            'pass_good': [],
            'pass_nogood': [],
            'fg': []
        }
        self.total_passes = 0
        self.total_completions = 0
        self.pass_list = []
        self.rush_list = []
        self.pass_or_sack = 0
        self.num_sacks = 0
        self.sack_dist = []
        self.total_interceptions = 0
        field_goal_attempts = {0: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0}
        field_goal_successes = {0: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0}
        self.field_goal_pct = {}
        total_runs = 0
        total_run_fumbles = 0
        total_pass = 0
        total_pass_fumbles = 0
        self.punt_dist = []
        punt_touchback = {90: 0, 80: 0, 70: 0, 60: 0, 50: 0, 40: 0, 30: 0, 20: 0}
        punt_kickrange = {90: 0, 80: 0, 70: 0, 60: 0, 50: 0, 40: 0, 30: 0, 20: 0}
        punt_total = 0

        #####################
        # Punt Calculations #
        #####################
        for punt_spot, return_spot, time in zip(loc_punt_spot, loc_punt_return,
                                                loc_time_elapsed):
            if not np.isnan(punt_spot) and not np.isnan(return_spot):
                punt_total += 1
                punt_range = np.floor(punt_spot / 10) * 10
                punt_kickrange[punt_range] += 1
                if return_spot == 80:
                    punt_touchback[punt_range] += 1
                else:
                    self.punt_dist.append(return_spot - (100 - punt_spot))
                if not np.isnan(time):
                    self.elapsed_time['punt'].append(time)
        self.punt_alpha, self.punt_loc, self.punt_beta = stats.gamma.fit(self.punt_dist)
        punt_x = np.arange(-10, 80, 1)
        g3 = gamma.pdf(x=punt_x, a=self.punt_alpha, loc=self.punt_loc,
                       scale=self.punt_beta)
        self.punt_touchback_pct = {}
        for key, value in punt_kickrange.items():
            if value != 0:
                self.punt_touchback_pct[key] = punt_touchback[key] / value

        ###########################
        # Field Goal Calculations #
        ###########################
        for fg_success, fg_distance, fg_play_type, time in zip(
                loc_fg_success, loc_fg_distance, loc_fg_play_type, loc_time_elapsed):
            if fg_play_type == 'Field Goal':
                marker = np.floor(fg_distance / 10) * 10
                # np.floor never returns None; NaN is the real invalid case
                if not np.isnan(marker):
                    if not np.isnan(time):
                        self.elapsed_time['fg'].append(time)
                    field_goal_attempts[marker] += 1
                    if fg_success == 'Good':
                        field_goal_successes[marker] += 1
        for key, value in field_goal_attempts.items():
            if value > 0:
                self.field_goal_pct[key] = field_goal_successes[key] / value
            else:
                self.field_goal_pct[key] = 0

        #######################
        # Fumble Calculations #
        #######################
        for i, fumble in enumerate(loc_fumble):
            current_game = loc_gameID[i]
            current_drive = loc_drive[i]
            if loc_play_type_fumble[i] == 'Pass':
                total_pass += 1
                if fumble == 1 and i + 1 < len(loc_gameID):  # guard the look-ahead
                    if loc_gameID[i + 1] == current_game:
                        if (loc_drive[i + 1] == current_drive
                                or loc_drive[i + 1] == current_drive + 1):
                            pass
                        else:
                            total_pass_fumbles += 1
            elif loc_play_type_fumble[i] == 'Run':
                total_runs += 1
                if fumble == 1 and i + 1 < len(loc_gameID):
                    if loc_gameID[i + 1] == current_game:
                        if (loc_drive[i + 1] == current_drive
                                or loc_drive[i + 1] == current_drive + 1):
                            pass
                        else:
                            total_run_fumbles += 1
        self.pass_fumble_pct = total_pass_fumbles / total_pass
        self.run_fumble_pct = total_run_fumbles / total_runs

        #############################
        # Pass and Run Calculations #
        #############################
        for pass_outcome, yrds_gained, play_type, interception, time in zip(
                loc_pass_outcome, loc_yrds_gained, loc_play_type,
                loc_interception, loc_time_elapsed):
            if play_type == 'Pass' or play_type == 'Sack':
                self.pass_or_sack += 1
            if play_type == 'Sack':
                self.num_sacks += 1
                self.sack_dist.append(yrds_gained)
            if play_type == 'Pass':
                self.total_passes += 1
                if pass_outcome == "Complete":
                    self.total_completions += 1
                    self.pass_list.append(yrds_gained)
                    if not np.isnan(time):
                        self.elapsed_time['pass_good'].append(time)
                else:
                    if not np.isnan(time):
                        self.elapsed_time['pass_nogood'].append(time)
                if interception == 1:
                    self.total_interceptions += 1
            elif play_type == 'Run':
                if not np.isnan(time):
                    self.elapsed_time['run'].append(time)
                self.rush_list.append(yrds_gained)

        self.time_kde = {}
        self.time_kde['pass_good'] = stats.gaussian_kde(
            self.elapsed_time['pass_good'], bw_method=.2)
        self.time_kde['pass_nogood'] = stats.gaussian_kde(
            self.elapsed_time['pass_nogood'], bw_method=.2)
        self.time_kde['punt'] = stats.gaussian_kde(self.elapsed_time['punt'],
                                                   bw_method=.2)
        self.time_kde['run'] = stats.gaussian_kde(self.elapsed_time['run'],
                                                  bw_method=.2)
        self.time_kde['fg'] = stats.gaussian_kde(self.elapsed_time['fg'],
                                                 bw_method=.2)
        self.pass_complete_pct = self.total_completions / self.total_passes
        self.pass_alpha, self.pass_loc, self.pass_beta = stats.gamma.fit(self.pass_list)
        self.run_alpha, self.run_loc, self.run_beta = stats.gamma.fit(self.rush_list)
        self.sack_pct = self.num_sacks / self.pass_or_sack
        self.sack_yrds_mean = np.mean(self.sack_dist)
        self.sack_yrds_std = np.std(self.sack_dist)
        self.interception_pct = self.total_interceptions / self.total_passes

        #############
        # Debugging #
        #############
        if debug_probs:
            pass_x = np.arange(0, 40, .1)
            g1 = gamma.pdf(x=pass_x, a=self.pass_alpha, loc=self.pass_loc,
                           scale=self.pass_beta)
            run_x = np.arange(-10, 20, .1)
            g2 = gamma.pdf(x=run_x, a=self.run_alpha, loc=self.run_loc,
                           scale=self.run_beta)
            fig2 = plt.figure()
            ax1 = fig2.add_subplot(2, 1, 1)
            ax1.plot(pass_x, g1)
            ax1.hist(self.pass_list, bins=20, density=True)  # `normed` was removed in matplotlib 3.x
            ax1.set_xlabel('Pass Yards')
            ax1.set_ylabel('Probability')
            ax2 = fig2.add_subplot(2, 1, 2)
            ax2.plot(run_x, g2)
            ax2.hist(self.rush_list, 20, density=True)
            ax2.set_xlabel('Rush Yards')
            ax2.set_ylabel('Probability')
            fig2.show()
            fig3 = plt.figure()
            ax3 = fig3.add_subplot(1, 1, 1)
            ax3.plot(punt_x, g3)
            ax3.hist(self.punt_dist, bins=20, density=True)
            fig3.show()
            fig6 = plt.figure()
            ax6 = fig6.add_subplot(1, 1, 1)
            print('TIMES', self.elapsed_time)
            for key, value in self.elapsed_time.items():
                ax6.hist(value, histtype='step', label=key)
            ax6.legend()
            fig6.show()
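# A hypothetical driver for the class above, assuming a play-by-play CSV
# with the columns referenced in _generate_lists(); the file name and
# team code are illustrative.
pbp = pd.read_csv('play_by_play.csv')
team = Team('NE', pbp)
team.train_classifier(debug_classifier=True)
play_idx = team.test_classifier(yards_to_go=8, down=3, quarter=4,
                                yard_line=45, score_diff=-3)
print(team.valid_play_inv_dict[play_idx])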
X_train, X_test, y_train, y_test = train_test_split(
    cat_labelled_data[headers],
    cat_labelled_data[category_dummies_prefix.columns],
    test_size=0.2,
    random_state=42)
X_train = X_train[attributes_of_interest]
X_test = X_test[attributes_of_interest]
y_train = y_train.to_numpy()

clf.set_params(max_depth=i)
print(f'Model {name} {i}')
print('-- Training')
classifier = MultiOutputClassifier(clf, n_jobs=-1)
classifier.fit(X_train, y_train)
train_score = classifier.score(X_train, y_train)
print(f'Training score: {train_score}')

print('-- Test')
test_predictions = classifier.predict(X_test)
test_score = classifier.score(X_test, y_test)
print(f'Test score: {test_score}')

if test_score > best_model.get('score'):
    best_model['name'] = name + ' ' + str(i)
    best_model['score'] = test_score
    best_model['model'] = classifier

print(f"The best-scoring model is {best_model.get('name')} "
      f"with {best_model.get('score')}")
rf = RandomForestClassifier(random_state=42)  # random forest classifier
rf.fit(X_train, y_train)  # fit the random forest model on the train data
# rf_predictions = rf.predict(X_test)  # feed X_test into the fitted model to predict y_test
# print(rf_predictions)

# Evaluate the classifier - rf model before GridSearchCV
from sklearn.multioutput import MultiOutputClassifier  # wrapper that enables multi-output prediction

rf_classifier = MultiOutputClassifier(rf, n_jobs=1)
rf_classifier.fit(X_train, y_train)  # fit the multi-output-capable model on the train data
rf_predictions2 = rf_classifier.predict(X_test)  # feed X_test into the fitted model to predict y_test
print(rf_predictions2)
print(rf_classifier.score(X_train, y_train))  # training-set accuracy: 94.91%

# GridSearchCV: cross-validation and parameter search in one step
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200, 300], 'max_depth': [5, 10, 20]}
forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
print(grid_search.fit(X_train, y_train))
print('Best mean score: {0:.4f}'.format(grid_search.best_score_))
print('GridSearchCV best parameters: ', grid_search.best_params_)
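# The grid search above tunes a bare RandomForestClassifier; to tune the
# multi-output wrapper itself, prefix the parameters with estimator__ --
# a sketch using the same data:
search = GridSearchCV(
    MultiOutputClassifier(RandomForestClassifier(random_state=42)),
    {'estimator__n_estimators': [50, 100], 'estimator__max_depth': [5, 10]},
    cv=5)
search.fit(X_train, y_train)
print(search.best_params_)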
def pedicting_tag(request):
    print('inside predicting tag')

    class lemmatokenizer(object):
        def __init__(self):
            self.stemmer = SnowballStemmer('english')
            self.token_pattern = r"(?u)\b\w\w+\b"
            # self.wnl = WordNetLemmatizer()

        def __call__(self, doc):
            # here, doc is one string sentence
            token_pattern = re.compile(self.token_pattern)
            return [self.stemmer.stem(t) for t in token_pattern.findall(doc)]
            # return lambda doc: token_pattern.findall(doc)
            # return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

    vect_title = CountVectorizer(max_df=0.5, min_df=5, stop_words='english',
                                 tokenizer=lemmatokenizer(), ngram_range=(1, 3))
    tfidf_vect_title = TfidfVectorizer(smooth_idf=False, max_df=0.5, min_df=5,
                                       stop_words='english',
                                       tokenizer=lemmatokenizer(),
                                       ngram_range=(1, 3))

    le = preprocessing.LabelEncoder()
    le.fit(y_labels)
    d_set['label_num'] = pd.Series(
        [le.transform(ast.literal_eval(i)) for i in d_set['tag']])
    d_set.head()
    new_y_labels = d_set['label_num'].values.tolist()

    mlb = MultiLabelBinarizer()
    mlb.fit(new_y_labels)
    y_tag_dtm = mlb.transform(new_y_labels)
    y_tag_dtm.shape

    X_labels = d_set['title'].values.tolist()
    # print(X_labels)

    vect_title.fit(X_labels)
    X_title_dtm = vect_title.transform(X_labels)

    from sklearn.decomposition import PCA
    pca = PCA(n_components=100).fit(X_title_dtm.toarray())
    pca_samples = pca.transform(X_title_dtm.toarray())
    pca_df = pd.DataFrame(np.round(pca_samples, 4))
    print(pca_df.head())

    new_df = pd.DataFrame(X_title_dtm.toarray(),
                          columns=vect_title.get_feature_names())
    new_df.shape
    d = collections.Counter(vect_title.get_feature_names())
    new_df['target_list'] = [i for i in y_tag_dtm]

    tfidf_vect_title.fit(X_labels)
    X_title_dtm_tfidf = tfidf_vect_title.transform(X_labels)

    new_df_of_tfidf = pd.DataFrame(X_title_dtm_tfidf.toarray(),
                                   columns=tfidf_vect_title.get_feature_names())
    new_df_of_tfidf['target_list'] = [i for i in y_tag_dtm]

    y = new_df_of_tfidf['target_list']
    X = new_df_of_tfidf.drop('target_list', axis=1)
    X = np.array(X.values.tolist())  # convert list to numpy ndarray
    y = np.array(y.values.tolist())
    # print(X[0])

    pca_X = PCA(n_components=200).fit_transform(X)
    pca_X = np.round(pca_X, 4)
    pca_y = PCA(n_components=50).fit_transform(y)
    pca_y = np.round(pca_y, 4)
    print(pca_y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=1)
    # X_train, X_test, y_train, y_test = train_test_split(pca_X, pca_y, test_size=0.2, random_state=1)

    # clf = Pipeline([('classifier', OneVsRestClassifier(SVC(probability=True, random_state=0)))])  # just a Pipeline example
    knn_clf = KNeighborsClassifier(n_neighbors=5)
    # mnb_clf = MultinomialNB()  # does not work for multilabel input
    # svc_clf = OneVsRestClassifier(SVC(probability=True, random_state=0))
    # time_pass_y = np.random.randint(2, size=(2838, 1))  # produce ndarray of size 2838 x 1
    knn_clf.fit(X_train, y_train)
    # mnb_clf.fit(X_train, y_train)
    knn_pred = knn_clf.predict(X_test)
    # mnb_pred = mnb_clf.predict(X_test)
    # svc_pred = svc_clf.predict(X_test)

    knn_clf.score(X_test, y_test)

    from sklearn import metrics
    knn_report = metrics.classification_report(y_test[:100], knn_pred[:100])
    knn_f1_score = metrics.f1_score(y_test[:], knn_pred[:], average='samples')
    knn_precision_recall_fscore = metrics.precision_recall_fscore_support(
        y_test, knn_pred, average='samples')  # on the full data set
    knn_avg_precision_score = metrics.average_precision_score(
        y_test, knn_pred, average='samples')
    knn_roc_auc_score = metrics.roc_auc_score(y_test, knn_pred, average='samples')
    # mnb_report = metrics.classification_report(y_test[:100], mnb_pred[:100])  # throws an error: mnb_clf can't work on multilabel output

    # note: for multilabel y this is the subset accuracy (exact-match
    # ratio), not the Hamming score
    metrics.accuracy_score(y_true=y_test[:100], y_pred=knn_pred[:100])

    # print(knn_report)  # its type is str
    print("For knn_clf (KNearestNeighbours) : ")
    print("precision, recall, fbeta_score, support : ", knn_precision_recall_fscore)
    print("f1_score : ", knn_f1_score)
    print("avg. precision_score : ", knn_avg_precision_score)
    print("roc_auc_score : ", knn_roc_auc_score)

    # def does_test_tag_match(d, list_of_tags):  # no need for this function

    test = ["how to use policy iteration in ml ?"]
    # test = ["what is lstm ?"]
    # test_dtm = vect_title.transform(test)  # without tfidf
    test_dtm = tfidf_vect_title.transform(test)  # with tfidf
    # print(test_dtm.toarray()[0])
    status = False
    for i in test_dtm.toarray()[0]:
        if i != 0:
            status = True
            break
    ans = knn_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if len(ans[0]) == 0 or status == False:
        print("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print(ans)

    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_clf = MultiOutputClassifier(forest, n_jobs=-1)
    rf_clf.fit(X_train, y_train)
    rf_pred = rf_clf.predict(X_test)

    rf_clf
    metrics.accuracy_score(y_true=y_test[:100], y_pred=rf_pred[:100])  # subset accuracy (exact-match ratio)
    rf_clf.score(X_test, y_test)

    rf_report = metrics.classification_report(y_test[:100], rf_pred[:100])
    rf_f1_score = metrics.f1_score(y_test, rf_pred, average='samples')
    rf_precision_recall_fscore = metrics.precision_recall_fscore_support(
        y_test, rf_pred, average='samples')  # on the full data set
    rf_avg_precision_score = metrics.average_precision_score(
        y_test, rf_pred, average='samples')
    rf_roc_auc_score = metrics.roc_auc_score(y_test, rf_pred, average='samples')

    # print(rf_report)
    print("For rf_clf (RandomForest) : ")
    print("precision, recall, fbeta_score, support : ", rf_precision_recall_fscore)
    print("f1_score : ", rf_f1_score)
    print("avg. precision_score : ", rf_avg_precision_score)
    print("roc_auc_score : ", rf_roc_auc_score)

    # test = ["what is reinforcement learning ?"]
    test = ["what is ai,lstm and data visualization ?"]
    # test_dtm = vect_title.transform(test)  # without tfidf
    test_dtm = tfidf_vect_title.transform(test)  # with tfidf
    status = False
    for i in test_dtm.toarray()[0]:
        if i != 0:
            status = True
            break
    ans = rf_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if len(ans[0]) == 0 or status == False:
        print("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print(ans)
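# The "status" loops above only check whether the query produced any
# non-zero features; a scipy sparse matrix exposes that count directly:
status = test_dtm.nnz > 0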
# Split Train/Test
###############################################################################
(inputs, outputs) = (DATA[FEATS], DATA[['CPT', 'WOP']])
(TRN_X, VAL_X, TRN_Y, VAL_Y) = train_test_split(
    inputs, outputs,
    test_size=float(VT_SPLIT),
    stratify=outputs
)
(TRN_L, VAL_L) = [i.shape[0] for i in (TRN_X, VAL_X)]
###############################################################################
# Define Model
###############################################################################
rf = RandomForestClassifier(
    n_estimators=TREES, max_depth=DEPTH,
    criterion='entropy',
    min_samples_split=5, min_samples_leaf=50,
    max_features=None, max_leaf_nodes=None,
    n_jobs=JOB
)
clf = MultiOutputClassifier(rf)
# K-fold training -------------------------------------------------------------
kScores = cross_val_score(clf, TRN_X, TRN_Y)
kScores
###############################################################################
# Train Model
###############################################################################
clf.fit(TRN_X, TRN_Y)
# Predict ---------------------------------------------------------------------
PRD_Y = clf.predict(VAL_X)
clf.score(VAL_X, VAL_Y)
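# cross_val_score above uses the wrapper's default score, i.e. the
# exact-match accuracy over both outputs at once. A sketch of a
# per-output breakdown with the same variables:
for i, col in enumerate(['CPT', 'WOP']):
    colScores = cross_val_score(rf, TRN_X, TRN_Y.iloc[:, i])
    print(col, colScores.mean())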
    if 'z' in col:
        y_train_regr[col] = y_train[col]
        y_test_regr[col] = y_test[col]
    else:
        y_train_clf[col] = y_train[col]
        y_test_clf[col] = y_test[col]

mo_clf = MultiOutputClassifier(rs_clf)

# Fit the data to the models
print('Fitting data')
mo_clf.fit(x_train, y_train_clf)
rs_regr.fit(x_train, y_train_regr)

# Print the results of the fit on the test data
print('Test classification score: %.3f' % mo_clf.score(x_test, y_test_clf))
print('Test regression R2 score: %.3f' % rs_regr.score(x_test, y_test_regr))

# Plot the decision surfaces of the classifier and regressor
x = pd.DataFrame(np.linspace(0, 5, 25))
y = pd.DataFrame(np.linspace(0, 5, 25))

# Create a grid to plot our predicted values over
surf_x = pd.DataFrame(np.array(np.meshgrid(x, y)).T.reshape(-1, 2))
surf_z = pd.DataFrame()

# Predict a value for each (x, y) pair in the grid
X_test = PCA(n_components=2).fit_transform(X_test)
ax2.set_title('Test labels')
ax2.scatter(X_test[:, 0], X_test[:, 1],
            c=np.sum(Y_test * np.array([1, 2, 3, 4, 5]), axis=1))
ax2.set_xlabel('Feature 0 count')

forest = RandomForestClassifier(n_estimators=100, random_state=1)
decision = DecisionTreeClassifier()

# training step
# note: X_test was reduced to two PCA components above, so X must have
# a matching width for the predict calls below to succeed
multi_target_R = MultiOutputClassifier(forest, n_jobs=-1)
result_R = multi_target_R.fit(X, Y)
result_R = multi_target_R.predict(X_test)
score_R = multi_target_R.score(X_test, Y_test)

multi_target_D = MultiOutputClassifier(decision, n_jobs=-1)
multi_target_D = multi_target_D.fit(X, Y)
result_D = multi_target_D.predict(X_test)
score_D = multi_target_D.score(X_test, Y_test)

# Plot classification result
ax3.scatter(X_test[:, 0], X_test[:, 1],
            c=np.sum(result_D * np.array([1, 2, 3, 4, 5]), axis=1))
ax3.set_title('Decision Tree labels %0.2f' % score_D)
ax3.set_ylabel('Feature 1 count')
ax3.set_xlabel('Feature 0 count')

X_w_D = []
for i in range(len(result_D)):
import numpy as np
import pandas as pd
from simulations.irs_v2x_simulation import IRSV2XSimulation
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from joblib import dump, load

data = pd.read_csv('data_position_simulation.csv')

irs_antnum = 256
cols_x = [IRSV2XSimulation.COL_POS_X, IRSV2XSimulation.COL_POS_Y]
# IRSV2XSimulation.COL_POS_Z,
# IRSV2XSimulation.COL_SPEED]
cols_y = []
for n in range(irs_antnum):
    cols_y.append(IRSV2XSimulation.COL_PHASE + str(n))

X = data[cols_x].to_numpy()
Y = data[cols_y].to_numpy()
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

forest = RandomForestClassifier(n_estimators=100, random_state=1)
classifier = MultiOutputClassifier(forest, n_jobs=-1)
classifier.fit(X[0:100, :], Y[0:100, :])
dump(classifier, 'classifier.joblib')
print(classifier.score(X, Y))
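# Reloading the persisted model later; a sketch assuming the joblib file
# written above:
clf = load('classifier.joblib')
phases = clf.predict(X[:5, :])  # one predicted phase per antenna column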
y_train = df_train[categories].values

mltout_clf_svm = MultiOutputClassifier(svm.SVC(), n_jobs=-1).fit(x_train, y_train)
mltout_clf_per = MultiOutputClassifier(Perceptron(), n_jobs=-1).fit(x_train, y_train)

data_test = arff.loadarff('scene-test.arff')
df_test = pd.DataFrame(data_test[0])
df_test.replace(converte, inplace=True)

x_test = df_test.drop(categories, axis=1)
y_test = df_test[categories]

# Build the array of predictions for each example
y_pred_svm = mltout_clf_svm.predict(x_test)
# Accuracy of the method
print("Accuracy:", mltout_clf_svm.score(x_test, y_test))
# Confusion matrix for each label
plot_confusion_matrix(y_test, y_pred_svm, classes=[0, 1], title=categories)

# Build the array of predictions for each example
y_pred_per = mltout_clf_per.predict(x_test)
# Accuracy of the method
print("Accuracy: ", mltout_clf_per.score(x_test, y_test))
# Confusion matrix for each label (the original re-plotted y_pred_svm here)
plot_confusion_matrix(y_test, y_pred_per, classes=[0, 1], title=categories)
bestc = []
c = 0.00001
while c < 10003:
    print("*")
    biii_model = MultiOutputClassifier(
        LinearSVC(penalty='l1', C=c, dual=False), n_jobs=-1)
    print(c)
    score = cross_val_score(biii_model, StrainX, trainy, cv=10, n_jobs=-1)
    print(score.mean())
    bestc.append(score.mean())
    c = c * 10

biii_model = MultiOutputClassifier(
    LinearSVC(penalty='l1', C=1, dual=False), n_jobs=-1)
biii_model.fit(StrainX, trainy)
predl1 = biii_model.predict(StestX)
print("Test Accuracy is", biii_model.score(StestX, testy))
print("Hamming Loss", hamloss(testy, predl1))
hamlossL(testy, predl1)
print("Exact Match Score", exactmatch(testy, predl1))
exactmatchL(testy, predl1)

bestd = []
c = 0.00001
while c < 10003:
    print("*")
    biv_model = MultiOutputClassifier(
        LinearSVC(penalty='l1', C=c, dual=False, class_weight='balanced'),
        n_jobs=-1)
    print(c)
    score = cross_val_score(biv_model, StrainX, trainy, cv=10, n_jobs=-1)
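# hamloss/exactmatch above appear to be project helpers; sklearn ships
# equivalent metrics, sketched here under that assumption:
from sklearn.metrics import accuracy_score, hamming_loss

print("Hamming Loss", hamming_loss(testy, predl1))
print("Exact Match Score", accuracy_score(testy, predl1))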
question_tfidftransformed_acp = question_acp.transform(question_tfidftransformed)
answer_tfidftransformed_acp = answer_acp.transform(answer_tfidftransformed)

X_transformed = sp.hstack(
    [question_tfidftransformed_acp, answer_tfidftransformed_acp, X_category])

X_train, X_test, y_train, y_test = train_test_split(X_transformed,
                                                    y_transformed,
                                                    test_size=0.15)

dtc = DecisionTreeClassifier(max_depth=5)
dtc_multi = MultiOutputClassifier(dtc, n_jobs=-1)
dtc_multi.fit(X_train, y_train)
dtc_multi.score(X_test, y_test)


def transform_split_score(X, y):
    if isinstance(X, list):
        X_transformed = sp.hstack(
            X[text_var].apply(lambda col: vectorizer.fit_transform(col)))
    else:
        # the original fell through here and overwrote the branch above
        X_transformed = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_transformed,
                                                        y,
                                                        test_size=0.15)
    dtc_multi.fit(X_train, y_train)
    return dtc_multi.score(X_test, y_test)


# tree interpretation
def main():
    # Script argument parsing
    parser = argparse.ArgumentParser(
        description='Homework 03 - Machine learning a.a. 2018/19 - Predict missing values',
        epilog=' coded by: Emanuele Palombo')
    parser.add_argument('dataset_name',
                        metavar='DATASET',
                        type=str,
                        nargs='?',
                        default=__default_ts_name,
                        help='{} (default {}) - dataset name'.format(
                            list(__ts_opts.keys()), __default_ts_name))
    parser.add_argument('--test-size', '-t',
                        dest='test_size',
                        action='store',
                        metavar='TEST_SIZE',
                        type=float,
                        default=__default_test_size,
                        help='[0-1] (default {}) - splitting size of TestSet'.format(
                            __default_test_size))
    parser.add_argument('--question-marks-ts', '-q',
                        dest='qm_repeted_ts',
                        action='store',
                        type=int,
                        default=__default_question_mark_count_repeated,
                        help='{{0,1,2...}} (default {}) - (this value * {} * samples) added to TrainingSet'
                        .format(__default_question_mark_count_repeated,
                                __default_question_mark_count))
    parser.add_argument('--no-split', '-s',
                        dest='no_split',
                        action='store_true',
                        default=__default_no_split,
                        help='(default {}) - keep whole DataSet for training'.format(
                            __default_no_split))
    parser.add_argument('--img-tag', '-i',
                        dest='img_tag',
                        action='store',
                        type=str,
                        default='',
                        help='string - add arbitrary string to saved images')
    parser.add_argument('--verbose', '-v',
                        dest='verbosity',
                        action='count',
                        default=__default_training_verbosity,
                        help='add more verbosity to output (repeat it to increase)')

    args = parser.parse_args()

    if args.dataset_name not in __ts_opts:
        print('ERROR: Choose a correct DataSet!\n')
        parser.print_help()
        exit(1)

    trainingset_selected_name = args.dataset_name
    test_size = args.test_size
    qm_repeted_ts = args.qm_repeted_ts
    dataset_no_split = args.no_split
    training_verbosity = args.verbosity
    img_tag = args.img_tag
    running_id = id_generator()
    ts_selected_opts = __ts_opts[trainingset_selected_name]
    # End script argument parsing

    print('\nDataSet selected: ' + ts_selected_opts['url'])

    # read dataset into a pandas dataframe
    dataset = pd.read_csv(ts_selected_opts['url'],
                          names=ts_selected_opts['columns'])

    if training_verbosity >= 1:
        print('\nFirst five rows of DataSet:\n')
        print(dataset.head())
        print('\nDataSet Length: {}'.format(len(dataset)))

    # DataSet Manipulation
    # remove rows with question marks (this avoids having '?' in the output)
    dataset = dataset[~(dataset.astype(str) == '?').any(1)]
    # strip out (remove) the "real output" (y)
    dataset = dataset.iloc[ts_selected_opts['x_slice'][0],
                           ts_selected_opts['x_slice'][1]]

    # Different approach to value conversion
    # convert all columns to int (str => int)
    # dataset = dataset.apply(lambda x: pd.factorize(x)[0] + 1)
    # convert all columns to int
    dataset = dataset.astype(int)

    # dataSet information
    features_count = len(dataset.columns)
    features_values = ds_features_values(dataset)

    # copy input features to output (columns * 2)
    for column in dataset.columns:
        dataset['y_' + column] = dataset[column]

    # Split DataSet
    training_set, test_set = train_test_split(
        dataset,
        test_size=test_size,
        random_state=__default_train_test_split_random_state)

    # check feature values between TrainingSet and TestSet
    # it's important to avoid extra values in the TestSet (i.e. an error on
    # log_loss for a mismatch in the predict_proba size)
    if not check_labels_split(features_count, training_set, test_set):
        exit(1)

    # Concat (add rows of) TrainingSet and TestSet
    # in this case the model can see all samples (including queries without '?')
    if dataset_no_split:
        training_set = pd.concat([training_set, test_set], axis=0)
        print('\nTraining over the whole DataSet')
    else:
        print('\nSplit DataSet in TrainingSet and TestSet (test size: {})'.format(test_size))

    # add (append) question marks
    # append qm_count rows, with 1 to qm_count '?'
    qm_count = int(ts_selected_opts['question_mark_count'])
    for i in range(qm_repeted_ts):
        for value_count in range(1, qm_count + 1):
            training_set = ds_mod_with_value(training_set, value_count,
                                             features_count, True)
            if training_verbosity >= 1:
                print('{} Added {} question mark (?) to TrainingSet for each sample'
                      .format(i, value_count))

    # Shuffle TrainingSet
    training_set = training_set.sample(frac=1)

    if training_verbosity >= 1:
        print('\nManipulated TrainingSet:\n')
        print(training_set.head())
        print('\nTrainingSet Length: {}'.format(len(training_set)))

    # TrainingSet: input X (features) and output y ("mirrored" features)
    x_train = training_set.iloc[:, 0:features_count]
    y_train = training_set.iloc[:, features_count:]
    # TestSet: input X (features) and output y ("mirrored" features)
    x_test = test_set.iloc[:, 0:features_count]
    y_test = test_set.iloc[:, features_count:]

    if training_verbosity >= 2:
        print('\nInput train:\n {}'.format(x_train.head()))
        print('\nOutput train:\n {}'.format(y_train.head()))
        print('\nInput test:\n {}'.format(x_test.head()))
        print('\nOutput test:\n {}'.format(y_test.head()))

    x_train = x_train.values
    y_train = y_train.values
    y_test = y_test.values

    # oneHot encoding (characteristic vector)
    # passing features_values without None forces OneHotEncoder to transform None to a null vector
    one_hot_encoder = OneHotEncoder(categories=features_values,
                                    handle_unknown='ignore')
    one_hot_encoder.fit(x_train)
    x_train_encoded = one_hot_encoder.transform(x_train).toarray()

    if training_verbosity >= 2:
        print('\nOneHotEncoding...\nexample: {} => {}'.format(
            x_train[0], x_train_encoded[0]))

    # store all results/metrics for each model/classifier
    results = {}

    for classifier_name in __deafult_model_classifier:
        filename = 'model_{}_{}.sav'.format(trainingset_selected_name,
                                            classifier_name)

        if os.path.isfile(filename):
            # load a model that was already trained
            multi_output_classifier = joblib.load(filename)
            print('\n### Model {} loaded by file: {}\nImportant: remove the file to re-train the model!'
                  .format(classifier_name, filename))
        else:
            n_jobs = None
            model_verbosity = True if training_verbosity >= 3 else False

            if classifier_name == 'MLP':
                classifier = MLPClassifier(
                    hidden_layer_sizes=ts_selected_opts['mlp_hidden_layers_sizes'],
                    max_iter=1000,
                    verbose=model_verbosity)
            elif classifier_name == 'KNN':
                n_jobs = None
                classifier = KNeighborsClassifier(
                    n_neighbors=ts_selected_opts['knn_k'])
            elif classifier_name == 'SVM':
                classifier = SVC(gamma='scale',
                                 decision_function_shape='ovo',
                                 probability=True,
                                 verbose=model_verbosity)
            elif classifier_name == 'RandomForest':
                classifier = RandomForestClassifier(
                    n_estimators=ts_selected_opts['random_forest_estimator'],
                    verbose=model_verbosity)

            print('\n### Init and training the model: {}'.format(classifier_name))

            # init MultiOutput for the classifier
            multi_output_classifier = MultiOutputClassifier(classifier,
                                                            n_jobs=n_jobs)
            multi_output_classifier.fit(x_train_encoded, y_train)

            # save the model to disk
            joblib.dump(multi_output_classifier, filename)

        results[classifier_name] = collections.defaultdict(list)
        metris_result = results[classifier_name]

        # create input test (query) with different numbers of '?'
        for query_count_question_mark in range(
                ts_selected_opts['question_mark_count'] + 1):
            print('\n## Add {} question marks to input test (query)'.format(
                query_count_question_mark))

            # modify (in place) the input test with question marks
            x_test_with_qm = ds_mod_with_value(
                x_test.copy(),
                value_count=query_count_question_mark,
                append=False)

            if training_verbosity >= 2:
                print('\nInput test (query):\n {}'.format(
                    pd.DataFrame(data=x_test_with_qm).head()))

            # encode the input test
            x_test_encoded = one_hot_encoder.transform(x_test_with_qm).toarray()

            # compute output prediction and probability
            y_pred = multi_output_classifier.predict(x_test_encoded)
            y_pred_proba = multi_output_classifier.predict_proba(x_test_encoded)

            # accuracy on the whole output
            score = multi_output_classifier.score(x_test_encoded, y_test)
            # the Hamming loss corresponds to the Hamming distance between y_test and y_pred
            hamming_loss = np.sum(np.not_equal(y_test, y_pred)) / float(y_test.size)

            # compute y_test and y_pred as if the output were only the query question marks
            y_test_reduced, y_pred_reduced = reduce_y_to_qm(x_test_with_qm,
                                                            y_test, y_pred)

            # write y_pred_proba to file (csv)
            write_pred_proba(
                y_pred_proba,
                '{}{}-{}-q{}-{}{}.csv'.format(__default_csv_path,
                                              trainingset_selected_name,
                                              classifier_name,
                                              query_count_question_mark,
                                              running_id, img_tag))

            print('\nMetrics:')
            print(' {:<30} | {:^10} | {:>10}'.format('features', 'accuracy', 'log loss'))
            print('-' * (30 + 10 + 10 + 7))

            log_loss_avg = 0
            # for each output column => compute accuracy and log_loss
            for feature_index in range(y_test.shape[1]):
                y_test_column = y_test[:, feature_index]
                y_pred_column = y_pred[:, feature_index]
                accuracy = accuracy_score(y_test_column, y_pred_column)
                # note: check_labels_split() was implemented to avoid an error here
                log_loss_value = log_loss(y_test_column,
                                          y_pred_proba[feature_index],
                                          labels=features_values[feature_index])
                print(' {:<30} | {:^10.4f} | {:>10.4f}'.format(
                    test_set.columns[feature_index], accuracy, log_loss_value))
                log_loss_avg += log_loss_value
                metris_result['accuracy_' + str(feature_index)].append(accuracy)
                metris_result['log_loss_' + str(feature_index)].append(log_loss_value)

            print('\nVirtual reduced output:')
            # for each reduced output (only question marks) => compute accuracy
            for index in range(query_count_question_mark):
                accuracy = accuracy_score(y_test_reduced[:, index],
                                          y_pred_reduced[:, index])
                print(' accuracy {}: {:>10.4f}'.format(index, accuracy))
                metris_result['accuracy_reduced_' + str(index)].append(accuracy)

            print('\nAll output:')
            print(' accuracy: {:>10.4f}'.format(score))
            print(' log_loss avg: {:>10.4f}'.format(log_loss_avg / y_test.shape[1]))
            print(' hamming loss: {:>10.4f}'.format(hamming_loss))

            metris_result['accuracy'].append(score)
            metris_result['log_loss_avg'].append(log_loss_avg / y_test.shape[1])
            metris_result['hamming_loss'].append(hamming_loss)

        # GRAPH PLOT per model/classifier
        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy'],
                         results[classifier_name]['log_loss_avg'],
                         results[classifier_name]['hamming_loss']],
                        labels=['accuracy', 'log loss avg', 'hamming loss'],
                        fmt=['bo-', 'ro-', 'yo-'],
                        title=classifier_name,
                        xlabel='Number of Question Marks in the query',
                        ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-{}{}.png'.format(__default_imgs_path,
                                                  trainingset_selected_name,
                                                  classifier_name,
                                                  running_id, img_tag),
                        dpi=200)

        # create a list of lists of accuracy per feature
        accuracy_lst = ['accuracy_' + str(index) for index in range(features_count)]
        accuracy_lst = [results[classifier_name][accuracy_key]
                        for accuracy_key in accuracy_lst]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy']] + accuracy_lst,
                        fmt=['bo-'] + ['g.--'] * len(accuracy_lst),
                        title=classifier_name + ': whole accuracy and those by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='accuracy',
                        ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # create a list of lists of accuracy_reduced per feature (padding with None in front when needed)
        accuracy_reduced_lst = [
            'accuracy_reduced_' + str(index)
            for index in range(ts_selected_opts['question_mark_count'])
        ]
        accuracy_reduced_lst = [results[classifier_name][accuracy_reduced]
                                for accuracy_reduced in accuracy_reduced_lst]
        accuracy_reduced_lst = [
            [None] * (ts_selected_opts['question_mark_count'] - len(accuracy_reduced) + 1)
            + accuracy_reduced
            for accuracy_reduced in accuracy_reduced_lst
        ]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy']] + accuracy_reduced_lst,
                        fmt=['bo-'] + ['m.--'] * len(accuracy_reduced_lst),
                        title=classifier_name + ': whole accuracy and the virtual accuracies by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='accuracy',
                        ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-reduced-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # create a list of lists of log_loss per feature
        log_loss_lst = ['log_loss_' + str(index) for index in range(features_count)]
        log_loss_lst = [results[classifier_name][log_loss_key]
                        for log_loss_key in log_loss_lst]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['log_loss_avg']] + log_loss_lst,
                        fmt=['ro-'] + ['c.--'] * len(log_loss_lst),
                        title=classifier_name + ': average log loss and those by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='log loss')
        if __default_save_img:
            plt.savefig('{}{}-{}-log-loss-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

    metrics_by_classifier = [
        results[classifier][metric]
        for classifier in __deafult_model_classifier
        for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']
    ]
    label_by_classifier = [
        classifier + ' ' + metric
        for classifier in __deafult_model_classifier
        for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']
    ]
    fmt_lst = [
        style.replace('0', character)
        for character in ['o', '^', 'v', '<', '>', '.', ',', '+', 'x']
        for style in ['b0-', 'r0-', 'y0-']
    ]

    # GRAPH PLOT comparing models/classifiers
    plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                    metrics_by_classifier,
                    labels=label_by_classifier,
                    fmt=fmt_lst,
                    title='Compare all models',
                    xlabel='Number of Question Marks in the query',
                    ylabel='',
                    ymax=1)
    if __default_save_img:
        plt.savefig('{}{}-comparing-{}{}.png'.format(__default_imgs_path,
                                                     trainingset_selected_name,
                                                     running_id, img_tag),
                    dpi=200)

    if not __default_save_img:
        plt.show()
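# MultiOutputClassifier.predict_proba returns one array per output
# column rather than a single matrix, which is why the script above
# indexes y_pred_proba[feature_index]; a sketch of that structure,
# reusing the names from main():
probas = multi_output_classifier.predict_proba(x_test_encoded)
for feature_index, proba in enumerate(probas):
    # proba has shape (n_samples, n_values_of_this_feature)
    print(feature_index, proba.shape)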
class Igel(object): """ Igel is the base model to use the fit, evaluate and predict functions of the sklearn library """ available_commands = ('fit', 'evaluate', 'predict', 'experiment') supported_types = ('regression', 'classification', 'clustering') results_path = configs.get('results_path') # path to the results folder default_model_path = configs.get( 'default_model_path') # path to the pre-fitted model description_file = configs.get( 'description_file') # path to the description.json file evaluation_file = configs.get( 'evaluation_file') # path to the evaluation.json file prediction_file = configs.get( 'prediction_file') # path to the predictions.csv default_dataset_props = configs.get( 'dataset_props' ) # dataset props that can be changed from the yaml file default_model_props = configs.get( 'model_props') # model props that can be changed from the yaml file model = None def __init__(self, **cli_args): logger.info(f"Entered CLI args: {cli_args}") logger.info(f"Executing command: {cli_args.get('cmd')} ...") self.data_path: str = cli_args.get('data_path') # path to the dataset logger.info(f"reading data from {self.data_path}") self.command = cli_args.get('cmd', None) if not self.command or self.command not in self.available_commands: raise Exception(f"You must enter a valid command.\n" f"available commands: {self.available_commands}") if self.command == "fit": self.yml_path = cli_args.get('yaml_path') file_ext = self.yml_path.split('.')[-1] logger.info(f"You passed the configurations as a {file_ext} file.") self.yaml_configs = read_yaml( self.yml_path) if file_ext == 'yaml' else read_json( self.yml_path) logger.info(f"your chosen configuration: {self.yaml_configs}") # dataset options given by the user self.dataset_props: dict = self.yaml_configs.get( 'dataset', self.default_dataset_props) # model options given by the user self.model_props: dict = self.yaml_configs.get( 'model', self.default_model_props) # list of target(s) to predict self.target: list = self.yaml_configs.get('target') self.model_type: str = self.model_props.get('type') logger.info(f"dataset_props: {self.dataset_props} \n" f"model_props: {self.model_props} \n " f"target: {self.target} \n") # if entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used else: self.model_path = cli_args.get('model_path', self.default_model_path) logger.info(f"path of the pre-fitted model => {self.model_path}") # load description file to read stored training parameters with open(self.description_file, 'r') as f: dic = json.load(f) self.target: list = dic.get( "target") # target to predict as a list self.model_type: str = dic.get( "type" ) # type of the model -> regression or classification self.dataset_props: dict = dic.get( 'dataset_props') # dataset props entered while fitting getattr(self, self.command)() def _create_model(self, **kwargs): """ fetch a model depending on the provided type and algorithm by the user and return it @return: class of the chosen model """ model_type: str = self.model_props.get('type') model_algorithm: str = self.model_props.get('algorithm') use_cv = self.model_props.get('use_cv_estimator', None) model_args = None if not model_type or not model_algorithm: raise Exception(f"model_type and algorithm cannot be None") algorithms: dict = models_dict.get( model_type) # extract all algorithms as a dictionary model = algorithms.get( model_algorithm) # extract model class depending on the algorithm logger.info( f"Solving a {model_type} problem using ===> {model_algorithm}") if not 
model: raise Exception("Model not found in the algorithms list") else: model_props_args = self.model_props.get('arguments', None) if model_props_args and type(model_props_args) == dict: model_args = model_props_args elif not model_props_args or model_props_args.lower() == "default": model_args = None if use_cv: model_class = model.get('cv_class', None) if model_class: logger.info( f"cross validation estimator detected. " f"Switch to the CV version of the {model_algorithm} algorithm" ) else: logger.info( f"No CV class found for the {model_algorithm} algorithm" ) else: model_class = model.get('class') logger.info(f"model arguments: \n" f"{self.model_props.get('arguments')}") model = model_class(**kwargs) if not model_args else model_class( **model_args) return model, model_args def _save_model(self, model): """ save the model to a binary file @param model: model to save @return: bool """ try: if not os.path.exists(self.results_path): logger.info( f"creating model_results folder to save results...\n" f"path of the results folder: {self.results_path}") os.mkdir(self.results_path) else: logger.info(f"Folder {self.results_path} already exists") logger.warning( f"data in the {self.results_path} folder will be overridden. If you don't " f"want this, then move the current {self.results_path} to another path" ) except OSError: logger.exception( f"Creating the directory {self.results_path} failed ") else: logger.info( f"Successfully created the directory in {self.results_path} ") pickle.dump(model, open(self.default_model_path, 'wb')) return True def _load_model(self, f: str = ''): """ load a saved model from file @param f: path to model @return: loaded model """ try: if not f: logger.info(f"result path: {self.results_path} ") logger.info(f"loading model form {self.default_model_path} ") model = pickle.load(open(self.default_model_path, 'rb')) else: logger.info(f"loading from {f}") model = pickle.load(open(f, 'rb')) return model except FileNotFoundError: logger.error(f"File not found in {self.default_model_path} ") def _prepare_fit_data(self): return self._process_data(target='fit') def _prepare_eval_data(self): return self._process_data(target='evaluate') def _process_data(self, target='fit'): """ read and return data as x and y @return: list of separate x and y """ assert isinstance(self.target, list), "provide target(s) as a list in the yaml file" if self.model_type != "clustering": assert len( self.target) > 0, "please provide at least a target to predict" try: read_data_options = self.dataset_props.get('read_data_options', None) dataset = pd.read_csv( self.data_path) if not read_data_options else pd.read_csv( self.data_path, **read_data_options) logger.info(f"dataset shape: {dataset.shape}") attributes = list(dataset.columns) logger.info(f"dataset attributes: {attributes}") # handle missing values in the dataset preprocess_props = self.dataset_props.get('preprocess', None) if preprocess_props: # handle encoding encoding = preprocess_props.get('encoding') if encoding: encoding_type = encoding.get('type', None) column = encoding.get('column', None) if column in attributes: dataset, classes_map = encode( df=dataset, encoding_type=encoding_type.lower(), column=column) if classes_map: self.dataset_props[ 'label_encoding_classes'] = classes_map logger.info( f"adding classes_map to dataset props: \n{classes_map}" ) logger.info( f"shape of the dataset after encoding => {dataset.shape}" ) # preprocessing strategy: mean, median, mode etc.. 
strategy = preprocess_props.get('missing_values') if strategy: dataset = handle_missing_values(dataset, strategy=strategy) logger.info( f"shape of the dataset after handling missing values => {dataset.shape}" ) if target == 'predict' or target == 'fit_cluster': x = _reshape(dataset.to_numpy()) if not preprocess_props: return x scaling_props = preprocess_props.get('scale', None) if not scaling_props: return x else: scaling_method = scaling_props.get('method', None) return normalize(x, method=scaling_method) if any(col not in attributes for col in self.target): raise Exception( "chosen target(s) to predict must exist in the dataset") y = pd.concat([dataset.pop(x) for x in self.target], axis=1) x = _reshape(dataset.to_numpy()) y = _reshape(y.to_numpy()) logger.info(f"y shape: {y.shape} and x shape: {x.shape}") # handle data scaling if preprocess_props: scaling_props = preprocess_props.get('scale', None) if scaling_props: scaling_method = scaling_props.get('method', None) scaling_target = scaling_props.get('target', None) if scaling_target == 'all': x = normalize(x, method=scaling_method) y = normalize(y, method=scaling_method) elif scaling_target == 'inputs': x = normalize(x, method=scaling_method) elif scaling_target == 'outputs': y = normalize(y, method=scaling_method) if target == 'evaluate': return x, y split_options = self.dataset_props.get('split', None) if not split_options: return x, y, None, None test_size = split_options.get('test_size') shuffle = split_options.get('shuffle') stratify = split_options.get('stratify') x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=test_size, shuffle=shuffle, stratify=None if not stratify or stratify.lower() == "default" else stratify) return x_train, y_train, x_test, y_test except Exception as e: logger.exception( f"error occured while preparing the data: {e.args}") def _prepare_clustering_data(self): """ preprocess data for the clustering algorithm """ return self._process_data(target='fit_cluster') def _prepare_predict_data(self): """ preprocess predict data to get similar data to the one used when training the model """ return self._process_data(target='predict') def get_evaluation(self, model, x_test, y_true, y_pred, **kwargs): res = None try: res = evaluate_model(model_type=self.model_type, model=model, x_test=x_test, y_pred=y_pred, y_true=y_true, get_score_only=False, **kwargs) except Exception as e: res = evaluate_model(model_type=self.model_type, model=model, x_test=x_test, y_pred=y_pred, y_true=y_true, get_score_only=True, **kwargs) return res def fit(self, **kwargs): """ fit a machine learning model and save it to a file along with a description.json file @return: None """ x_train = None x_test = None y_train = None y_test = None cv_results = None eval_results = None cv_params = None if self.model_type == 'clustering': x_train = self._prepare_clustering_data() else: x_train, y_train, x_test, y_test = self._prepare_fit_data() self.model, model_args = self._create_model(**kwargs) logger.info( f"executing a {self.model.__class__.__name__} algorithm...") # convert to multioutput if there is more than one target to predict: if self.model_type != 'clustering' and len(self.target) > 1: logger.info( f"predicting multiple targets detected. 
            self.model = MultiOutputClassifier(self.model) \
                if self.model_type == 'classification' else MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            cv_params = self.model_props.get('cross_validate', None)
            if not cv_params:
                logger.info("cross validation is not provided")
            else:
                cv_results = cross_validate(estimator=self.model,
                                            X=x_train,
                                            y=y_train,
                                            **cv_params)
            self.model.fit(x_train, y_train)
        else:
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(f"model saved successfully and can be found in the {self.results_path} folder")

        if self.model_type == 'clustering':
            eval_results = self.model.score(x_train)
        else:
            if x_test is None:
                logger.info("no split options were provided. The training score will be calculated")
                eval_results = self.model.score(x_train, y_train)
            else:
                logger.info("split option detected. The performance will be automatically "
                            "evaluated using the test data portion")
                y_pred = self.model.predict(x_test)
                eval_results = self.get_evaluation(model=self.model,
                                                   x_test=x_test,
                                                   y_true=y_test,
                                                   y_pred=y_pred,
                                                   **kwargs)

        fit_description = {
            "model": self.model.__class__.__name__,
            "arguments": model_args if model_args else "default",
            "type": self.model_props['type'],
            "algorithm": self.model_props['algorithm'],
            "dataset_props": self.dataset_props,
            "model_props": self.model_props,
            "data_path": self.data_path,
            "train_data_shape": x_train.shape,
            "test_data_shape": None if x_test is None else x_test.shape,
            "train_data_size": x_train.shape[0],
            "test_data_size": None if x_test is None else x_test.shape[0],
            "results_path": str(self.results_path),
            "model_path": str(self.default_model_path),
            "target": None if self.model_type == 'clustering' else self.target,
            "results_on_test_data": eval_results
        }
        if self.model_type == 'clustering':
            clustering_res = {
                # convert numpy arrays to lists so they can be JSON-serialized below
                "cluster_centers": self.model.cluster_centers_.tolist(),
                "cluster_labels": self.model.labels_.tolist()
            }
            fit_description['clustering_results'] = clustering_res
        if cv_params:
            cv_res = {
                "fit_time": cv_results['fit_time'].tolist(),
                "score_time": cv_results['score_time'].tolist(),
                "test_score": cv_results['test_score'].tolist()
            }
            fit_description['cross_validation_params'] = cv_params
            fit_description['cross_validation_results'] = cv_res

        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save results to an evaluation.json
        @return: None
        """
        x_val = None
        y_true = None
        eval_results = None
        try:
            model = self._load_model()
            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                y_pred = model.predict(x_val)
                eval_results = model.score(x_val, y_pred)

            logger.info(f"saving evaluation results to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(f"error occurred during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as csv
        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            x_val = self._prepare_predict_data()  # the same method is used for clustering
            y_pred = model.predict(x_val)
            y_pred = _reshape(y_pred)
            logger.info(f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}")
            logger.info(f"predict on targets: {self.target}")
            df_pred = pd.DataFrame.from_dict({
                self.target[i]: y_pred[:, i] if len(y_pred.shape) > 1 else y_pred
                for i in range(len(self.target))
            })
            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)
        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_mock_file(model_type=None, model_name=None, target=None, *args, **kwargs):
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")
        dataset_props = Igel.default_dataset_props
        model_props = Igel.default_model_props
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initializing a default igel.yaml in {path}")
        default_data = {
            "dataset": dataset_props,
            "model": model_props,
            "target": ['provide your target(s) here'] if not target else target.split()
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(f"a default igel.yaml is created for you in {path}. "
                        f"you just need to overwrite the values to meet your expectations")
        else:
            logger.warning("something went wrong while initializing a default file")
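The multioutput conversion buried inside fit() above is the part most relevant to MultiOutputClassifier and is easy to lose in the surrounding bookkeeping. The following is a minimal, self-contained sketch of the same conversion rule; the toy dataframe, its column names, and the RandomForestClassifier base estimator are illustrative assumptions, not igel's own defaults. For a regression model type, MultiOutputRegressor would take the place of MultiOutputClassifier, exactly as in the branch above.

# Minimal sketch of the multi-target conversion rule used in fit() above.
# The toy data and the RandomForest base estimator are illustrative only.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

df = pd.DataFrame({
    "f1": [0, 1, 0, 1, 1, 0, 1, 0],
    "f2": [1, 1, 0, 0, 1, 0, 0, 1],
    "t1": [0, 1, 0, 1, 1, 0, 1, 0],   # first target column
    "t2": [1, 0, 1, 0, 0, 1, 0, 1],   # second target column
})
targets = ["t1", "t2"]
y = pd.concat([df.pop(col) for col in targets], axis=1)  # same pop/concat trick as _process_data
x = df.to_numpy()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

base = RandomForestClassifier(random_state=0)
# more than one target -> wrap the base estimator, as fit() does for classification
model = MultiOutputClassifier(base) if len(targets) > 1 else base
model.fit(x_train, y_train)
print(model.score(x_test, y_test))  # subset accuracy: every target must be correct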
    datasetY.append(indices)

mlb = MultiLabelBinarizer()
datasetY = mlb.fit_transform(datasetY)  # list of label index sets -> binary indicator matrix

X_train, X_test, y_train, y_test = train_test_split(
    datasetX, datasetY, test_size=0.4, random_state=42)

classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(score)

# predict: simulate a spectrum for a known mixture of radionuclides
inv = ag.UnstablesInventory(data=[
    (db.getzai(radionuclides[2]), ACTIVITY),
    (db.getzai(radionuclides[0]), ACTIVITY),
    (db.getzai(radionuclides[5]), ACTIVITY),
    (db.getzai(radionuclides[3]), ACTIVITY)
])
hist, _ = lc(inv, spectype=SPECTYPE)
# binarize the histogram bins the same way the training features were built
print(classifier.predict([[1 if count > 0 else 0 for count in hist]]))
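The snippet above depends on the actigamma package (ag, db, lc) and a pre-built classifier, so it cannot be run in isolation. Below is a hedged, self-contained sketch of the same MultiLabelBinarizer-plus-multioutput pattern on synthetic data; the spectrum model, in which each label deterministically lights up a fixed subset of bins, is an invented stand-in for the simulated gamma spectra, and the LogisticRegression base estimator is an assumption.

# Self-contained sketch of the multilabel pattern above on synthetic data.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

rng = np.random.default_rng(42)
n_samples, n_bins, n_labels = 200, 50, 6

# invented spectrum model: each label activates its own fixed subset of bins,
# and a sample's binary spectrum is the union of its active labels' bins
label_bins = [rng.choice(n_bins, size=8, replace=False) for _ in range(n_labels)]
label_sets, spectra = [], []
for _ in range(n_samples):
    active = sorted(rng.choice(n_labels, size=rng.integers(1, 4), replace=False))
    spectrum = np.zeros(n_bins)
    for lab in active:
        spectrum[label_bins[lab]] = 1.0
    label_sets.append(active)
    spectra.append(spectrum)

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(label_sets)  # list of label sets -> (n_samples, n_labels) 0/1 matrix

X_train, X_test, y_train, y_test = train_test_split(
    np.array(spectra), Y, test_size=0.4, random_state=42)

clf = MultiOutputClassifier(LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # subset accuracy: all labels must match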