Example #1
def test_multi_output_exceptions():
    # NotFittedError when fit is not done but score, predict and
    # predict_proba are called
    moc = MultiOutputClassifier(LinearSVC(random_state=0))

    with pytest.raises(NotFittedError):
        moc.predict(y)

    with pytest.raises(NotFittedError):
        moc.predict_proba

    with pytest.raises(NotFittedError):
        moc.score(X, y)

    # ValueError when number of outputs is different
    # for fit and score
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    with pytest.raises(ValueError):
        moc.score(X, y_new)

    # ValueError when y is continuous
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        moc.fit(X, X[:, 1])
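The fixtures X, y, y1 and y2 referenced above live at module level in scikit-learn's test file and are not shown here; a minimal sketch of plausible stand-ins (the generator call and shapes are assumptions, chosen so that y has more output columns than y_new):

import numpy as np
from sklearn.datasets import make_classification

# Hypothetical stand-ins for the module-level test data: one generated label
# vector plus shifted copies, stacked into a three-output target.
X, y1 = make_classification(n_samples=100, n_features=20, random_state=0)
y2 = np.roll(y1, 1)
y3 = np.roll(y1, 2)
y = np.column_stack((y1, y2, y3))  # moc.fit(X, y) sees three outputs, y_new only two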
Example #2
def obtain_optimum_parameters(hidden_layers, neurons_per_layer, times=3):
    """
    Computes the accuracy of different multilayer perceptrons. A separate model is fitted for each
    combination of number of hidden layers and number of neurons per layer; every layer of a given
    MLP has the same number of neurons. The results of the models are stored in a JSON file.
    :param hidden_layers:       list of numbers of hidden layers
    :param neurons_per_layer:   list of numbers of neurons per layer
    :param times:               number of times each MLP is run
    """
    logging.info("Calculando el número de capas ocultas y neuronas óptimo...")

    # Dataframe que contendrá los resultados
    df = pd.DataFrame(index=neurons_per_layer, columns=hidden_layers)

    for layers in hidden_layers:
        logging.info("MLP's con {} capas ...".format(layers))

        for neurons in neurons_per_layer:
            hidden_layer_sizes = [neurons for _ in range(layers)]

            logging.info(
                'Training MLP with {} layers and {} neurons per hidden layer'.
                format(layers, neurons))

            # Train the model and average its score over several runs
            score = 0
            for _ in range(times):
                mlp = MLPClassifier(activation='logistic',
                                    solver='lbfgs',
                                    hidden_layer_sizes=hidden_layer_sizes,
                                    max_iter=1000)

                multi_mlp = MultiOutputClassifier(mlp, n_jobs=None)
                multi_mlp.fit(
                    train_tfidf, train_data[[
                        'arriendo', 'profesional', 'tipo_construccion_id'
                    ]])

                score += multi_mlp.score(
                    test_tfidf, test_data[[
                        'arriendo', 'profesional', 'tipo_construccion_id'
                    ]])
                logging.info("Score acumulado: {}".format(score))

            df.loc[neurons, layers] = score / times

            print('Accuracy: {}'.format(df.loc[neurons, layers]))

    # Store the DataFrame in a JSON file
    df.to_json('neurons_per_layer.json', orient='columns')

    # Plot
    df.rename(lambda x: '{} hidden layers'.format(x),
              axis='columns',
              inplace=True)

    df.plot.line(marker='o')
    plt.xlabel("n° neuronas en cada capa oculta")
    plt.ylabel("precisión")
    plt.show()
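A hedged usage sketch for the function above; it relies on the module-level train_tfidf, train_data, test_tfidf and test_data referenced inside the loop, and the value lists below are only illustrative:

obtain_optimum_parameters(hidden_layers=[1, 2, 3],
                          neurons_per_layer=[10, 50, 100],
                          times=3)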
Example #3
def logistic_regression(
    dataframe: pandas.DataFrame,
    inputs: Iterable[str],
    outputs: Iterable[str],
) -> LogisticRegressionResult:
    """Extract the dataset variables columns and fit them in a logistic regression model.

    Arguments:
        dataframe: the dataset to run the logistic regression against
        inputs: list of input variable names
        outputs: list of output variable names

    Returns:
        A result that contains the values of the coefficients and intercepts
    """
    x = dataframe[inputs]
    y = dataframe[outputs].copy()
    encoders = [LabelEncoder() for _ in outputs]
    for encoder, output in zip(encoders, outputs):
        y[output] = encoder.fit_transform(y[output])

    reg = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(x, y)  # type: ignore
    outcomes = [encoder.classes_.tolist() for encoder in encoders]
    coefs = [estimator.coef_.tolist() for estimator in reg.estimators_]
    intercepts = [estimator.intercept_.tolist() for estimator in reg.estimators_]
    score = float(reg.score(x, y))

    return LogisticRegressionResult(
        outcomes=outcomes,
        coefs=coefs,
        intercepts=intercepts,
        score=score,
    )
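LogisticRegressionResult is defined elsewhere in this project; judging only from the return statement above, a plausible (assumed) definition would be a small dataclass with the four fields being populated:

from dataclasses import dataclass
from typing import Any, List


@dataclass
class LogisticRegressionResult:
    # Hypothetical container mirroring the fields used in the return statement above.
    outcomes: List[List[Any]]       # class labels per output, from each LabelEncoder
    coefs: List[List[List[float]]]  # coefficient matrix per fitted estimator
    intercepts: List[List[float]]   # intercept vector per fitted estimator
    score: float                    # mean accuracy over all outputs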
Example #4
class GOClassifier:
    def __init__(self, X, y, random_seed=11, test_size=0.25, *args, **kwargs):
        ind = np.arange(X.shape[0])
        np.random.seed(random_seed)
        np.random.shuffle(ind)
        self.X = X[ind]
        self.y = y[ind]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_seed)
        self.random_seed = random_seed
        self.args = args
        self.kwargs = kwargs
        self.clf = None

    def fit(self, X=None, y=None):
        X_ = X if X is not None else self.X_train
        y_ = y if y is not None else self.y_train
        self.clf = MultiOutputClassifier(
            SGDClassifier(alpha=0.0001,
                          max_iter=1000,
                          tol=1e-3,
                          random_state=self.random_seed,
                          *self.args,
                          **self.kwargs))
        self.clf.fit(X_, y_)
        return self.clf

    def predict(self, X=None):
        assert self.clf is not None
        X_ = X if X is not None else self.X
        return self.clf.predict(X_)

    def test_predict(self):
        return self.predict(X=self.X_test)

    def score(self, X, y):
        assert self.clf is not None
        return self.clf.score(X, y)

    def test_score(self):
        assert self.clf is not None
        return self.clf.score(self.X_test, self.y_test)

    def train_score(self):
        assert self.clf is not None
        return self.clf.score(self.X_train, self.y_train)
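A hedged usage sketch for GOClassifier with synthetic multi-label data (the make_multilabel_classification call and its parameters are only illustrative):

from sklearn.datasets import make_multilabel_classification

# Each column of y_demo is a binary output handled by one SGDClassifier
X_demo, y_demo = make_multilabel_classification(n_samples=200, n_features=10,
                                                n_classes=3, random_state=11)
go_clf = GOClassifier(X_demo, y_demo, random_seed=11, test_size=0.25)
go_clf.fit()
print(go_clf.test_score())  # mean accuracy on the held-out split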
Example #5
    def SVM(self):
        """Fit a multi-output SVM classifier and report its test accuracy."""
        # The original instantiated SVM(kernel='rbf') here; unless SVC was imported
        # under that alias, scikit-learn's SVC(kernel='rbf') is what is meant.
        class_multisvm = MultiOutputClassifier(SVC(kernel='rbf'))

        # Fit on the train data
        class_multisvm.fit(self.X_train, self.y_train)

        # Check the prediction score
        score = class_multisvm.score(self.X_test, self.y_test)
        print("The prediction score on the test data is {:.2f}%".format(score*100))                                                 
Example #6
    def RF(self):
        """Fit a multi-output random forest classifier and report its test accuracy."""
        class_multirf = MultiOutputClassifier(RandomForestClassifier(max_depth=30,
                                                                      random_state=0))

        # Fit on the train data
        class_multirf.fit(self.X_train, self.y_train)

        # Check the prediction score
        score = class_multirf.score(self.X_test, self.y_test)

        print("The prediction score on the test data is {:.2f}%".format(score*100))
Example #7
    def multioutput_classifier(self, dt_final, x_train1, x_test1, y_train1,
                               y_test1):
        model = MultiOutputClassifier(dt_final)
        model.fit(x_train1,
                  y_train1)  # training the model; this can take a little time
        accuracy = model.score(
            x_test1, y_test1)  # accuracy on the held-out test set
        data = {
            'Accuracy': [accuracy],
            'Algorithm': ['DecisionTreeClassifier']
        }
        algorithm_output = pd.DataFrame(data)
        print("===============================")
        print("Multi-label model accuracy on the test set:")
        print(algorithm_output)
        return model
Example #8
def KNN(X_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier(algorithm='auto',
                               metric='minkowski',
                               metric_params=None,
                               n_jobs=-1,
                               n_neighbors=147,
                               p=2,
                               weights='distance')
    print("poopf")
    knn.fit(X_train, y_train)
    classifier = MultiOutputClassifier(knn, n_jobs=-1)
    classifier.fit(X_train, y_train)
    y_predict = (classifier.predict_proba(x_test))
    output = np.zeros((1967, 147))  #2597
    for x in range(1967):
        for y in range(147):
            output[x][y] = y_predict[y][x][1]
    # print(output)
    # np.savetxt("sub.csv", output, delimiter=",")
    # score() expects the feature matrix, not the probability matrix built above
    print(classifier.score(x_test, y_test))
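MultiOutputClassifier.predict_proba returns one (n_samples, n_classes) array per output, which is what the nested loop above transposes by hand. A hedged, vectorized equivalent for the binary outputs in this snippet:

# Stack the positive-class probability of every output into an
# (n_samples, n_outputs) matrix; assumes each of the 147 outputs is binary.
proba_matrix = np.stack([p[:, 1] for p in y_predict], axis=1)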
Example #9
x = np.load('./data/x_data.npy')
y = np.load('./data/y_data.npy')
x_pred = np.load('./data/x_pred.npy')

print("x.shape :", x.shape)
print("y.shape :", y.shape)
print("x_pred.shape :", x_pred.shape)

x = x.reshape(x.shape[0], 64 * 64 * 3)

x_pred = x_pred.reshape(x_pred.shape[0], 64 * 64 * 3)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=77,
                                                    shuffle=True)

# model = XGBClassifier()
model = MultiOutputClassifier(XGBRFClassifier())

# 3. Training
model.fit(x_train, y_train)

# 4. Evaluation and prediction
acc = model.score(x_test, y_test)

print("acc :", acc)

y_pred = model.predict(x_pred)
Example #10
########################################
if ML_option == "Logistic Regression":
    # Fit the model and predict X_test. Show some analysis.

    try:
        logReg = MultiOutputClassifier(LogisticRegression())
        logReg.fit(X_train, y_train)
        pred = logReg.predict(X_test)
        st.write('Mean Absolute Error (MAE):',
                 round(metrics.mean_absolute_error(y_test, pred), 4))
        st.write('Mean Squared Error (MSE):',
                 round(metrics.mean_squared_error(y_test, pred), 4))
        st.write('Root Mean Squared Error (RMSE):',
                 round(np.sqrt(metrics.mean_squared_error(y_test, pred)), 4))
        st.write('Accuracy of Logistic Regression on training set: ',
                 round(logReg.score(X_train, y_train), 4))
        st.write('Accuracy of Logistic Regression on test set: ',
                 round(logReg.score(X_test, y_test), 4))

        st.subheader("Classification Report")
        st.text(classification_report(y_test, pred))

        try:
            # Confusion matrix
            plot_confusion_matrix(y_test, pred, figsize=(7, 5), cmap="PuBuGn")
            bottom, top = plt.ylim()
            plt.ylim(bottom + 0.5, top - 0.5)
            st.pyplot()
        except Exception:
            st.write("The confusion matrix plot does not support multi-output targets.")
Example #11
from .MailCategorizator import Preprocessor
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import pickle
if __name__ == '__main__':
    preprocessor = Preprocessor()
    binarizer = MultiLabelBinarizer()
    clf = MultiOutputClassifier(RandomForestClassifier(), n_jobs=-1)

    X = preprocessor.build_tfidf_matrix()
    y = binarizer.fit_transform(preprocessor.get_target())

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    clf.fit(x_train, y_train)
    print(clf.score(x_test, y_test))

    # pickle.dump needs an open file object, not a directory path; the file names
    # below are placeholders.
    with open('../data/pickles/classifier.pkl', 'wb') as f:
        pickle.dump(clf, f)
    with open('../data/pickles/preprocessor.pkl', 'wb') as f:
        pickle.dump(preprocessor, f)
Example #12
class Team:
    def __init__(self, team_name, play_by_play_df):
        self.team = team_name
        self.team_df = play_by_play_df[play_by_play_df['posteam'] == self.team]
        self._generate_lists()

        self.valid_play_dict = {
            'Pass': 0,
            'Run': 1,
            'Punt': 2,
            'Field Goal': 3
        }
        self.valid_play_inv_dict = {
            0: 'Pass',
            1: 'Run',
            2: 'Punt',
            3: 'Field Goal'
        }

        self.X = []
        self.Y = []

    def train_classifier(self, debug_classifier=False):
        self._organize_training_data()
        self._generate_random_forest(debug_classifier)

    def _generate_random_forest(self, debug_classifier):
        self.forest = RandomForestClassifier(n_estimators=100, random_state=1)
        self.multi_target_forest = MultiOutputClassifier(self.forest,
                                                         n_jobs=-1)
        X_train, X_test, Y_train, Y_test = train_test_split(self.X,
                                                            self.Y,
                                                            test_size=0.1,
                                                            random_state=0)
        self.multi_target_forest.fit(X_train, Y_train)

        forests = self.multi_target_forest.estimators_
        forest0_feat = forests[0].feature_importances_.tolist()
        forest1_feat = forests[1].feature_importances_.tolist()
        forest2_feat = forests[2].feature_importances_.tolist()
        forest3_feat = forests[3].feature_importances_.tolist()

        feature_df = pd.DataFrame(
            data={
                'Features': [x for x in range(5)],
                'Forest0': forest0_feat,
                'Forest1': forest1_feat,
                'Forest2': forest2_feat,
                'Forest3': forest3_feat
            })

        if debug_classifier == True:
            print('Training Score: ',
                  self.multi_target_forest.score(X_train, Y_train))
            print('Test Score: ',
                  self.multi_target_forest.score(X_test, Y_test))

            fig1 = plt.figure()

            ax = fig1.add_subplot(111)

            width = 0.1

            feature_df.Forest0.plot(kind='bar',
                                    color='red',
                                    ax=ax,
                                    width=width,
                                    position=-1)
            feature_df.Forest1.plot(kind='bar',
                                    color='green',
                                    ax=ax,
                                    width=width,
                                    position=0)
            feature_df.Forest2.plot(kind='bar',
                                    color='blue',
                                    ax=ax,
                                    width=width,
                                    position=1)
            feature_df.Forest3.plot(kind='bar',
                                    color='yellow',
                                    ax=ax,
                                    width=width,
                                    position=2)

            ax.set_xticklabels([
                'Yards to First', 'Down', 'Quarter', 'Yardline', 'Score Diff'
            ],
                               rotation=0)
            ax.set_xlabel('Features')
            ax.set_ylabel('Feature Importance')
            ax.set_title('Random Forest - Feature Analysis')

            plt.xlim(-0.5, 4.5)
            plt.legend(['Pass', 'Run', 'Punt', 'Field Goal'])
            plt.show()

    def test_classifier(self, yards_to_go, down, quarter, yard_line,
                        score_diff):

        # predict_proba expects a 2-D array of shape (n_samples, n_features)
        input_array = np.array(
            [[yards_to_go, down, quarter, yard_line, score_diff]])
        # MultiOutputClassifier.predict_proba returns one array per output; collect
        # the class-1 probability of each play type for the single sample
        probas = self.multi_target_forest.predict_proba(input_array)
        prediction = [p[0][1] for p in probas]
        return np.argmax(prediction)

    def _generate_lists(self):

        self.play_type = self.team_df['PlayType'].values.tolist()
        self.game_ID = self.team_df['GameID'].values.tolist()
        self.drive = self.team_df['Drive'].values.tolist()
        self.quarter = self.team_df['qtr'].values.tolist()
        self.down = self.team_df['down'].values.tolist()
        self.time = self.team_df['time'].values.tolist()
        self.pos_team = self.team_df['posteam'].values.tolist()
        self.def_team = self.team_df['DefensiveTeam'].values.tolist()
        self.pass_length = self.team_df['PassLength'].values.tolist()
        self.pass_location = self.team_df['PassLocation'].values.tolist()
        self.pass_attempt = self.team_df['PassAttempt'].values.tolist()
        self.air_yards = self.team_df['AirYards'].values.tolist()
        self.rush_attempt = self.team_df['RushAttempt'].values.tolist()
        self.run_location = self.team_df['RunLocation'].values.tolist()
        self.run_gap = self.team_df['RunGap'].values.tolist()
        self.fieldgoal_distance = self.team_df[
            'FieldGoalDistance'].values.tolist()
        self.pos_team_score = self.team_df['PosTeamScore'].values.tolist()
        self.def_team_score = self.team_df['DefTeamScore'].values.tolist()
        self.yrdline100 = self.team_df['yrdline100'].values.tolist()
        self.yrds_to_go = self.team_df['ydstogo'].values.tolist()

    def _organize_training_data(self):

        score_diff_list = np.array(self.pos_team_score) - np.array(
            self.def_team_score)
        zipped_data = zip(self.quarter, self.down, self.yrdline100,
                          self.yrds_to_go, score_diff_list, self.play_type)

        for quarter, down, yrdln, yrds_to_go, score_diff, play_type in zipped_data:

            input_list = [yrds_to_go, down, quarter, yrdln, score_diff]
            if not np.any(np.isnan(
                    input_list)) and play_type in self.valid_play_dict:

                output_list = [0 for _ in range(4)]
                output_list[self.valid_play_dict[play_type]] = 1

                self.X.append(input_list)
                self.Y.append(output_list)

        self.X = np.array(self.X)
        self.Y = np.array(self.Y)

    def generate_success_probabilities(self, opponent, yr, debug_probs=False):
        ##############################
        # Extract Team Specific Data #
        ##############################
        self.opponent = opponent

        valid_dates = [
            str(yr) + '-' + '09',
            str(yr) + '-' + '10',
            str(yr) + '-' + '11',
            str(yr) + '-' + '12',
            str(yr + 1) + '-' + '01'
        ]

        coach_yr_09_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[0])]
        coach_yr_10_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[1])]
        coach_yr_11_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[2])]
        coach_yr_12_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[3])]
        coach_yr_01_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[4])]

        coach_yr_df = pd.concat([
            coach_yr_09_df, coach_yr_10_df, coach_yr_11_df, coach_yr_12_df,
            coach_yr_01_df
        ])

        team_prob_df = coach_yr_df[coach_yr_df['DefensiveTeam'] ==
                                   self.opponent]

        loc_pass_outcome = team_prob_df['PassOutcome'].values.tolist()
        loc_yrds_gained = team_prob_df['Yards.Gained'].values.tolist()
        loc_play_type = team_prob_df['PlayType'].values.tolist()
        loc_interception = team_prob_df['InterceptionThrown'].values.tolist()

        loc_play_type_fumble = coach_yr_df['PlayType'].values.tolist()
        loc_fumble = coach_yr_df['Fumble'].values.tolist()
        loc_drive = coach_yr_df['Drive'].values.tolist()
        loc_gameID = coach_yr_df['GameID'].values.tolist()

        loc_fg_success = coach_yr_df['FieldGoalResult']
        loc_fg_distance = coach_yr_df['yrdline100']
        loc_fg_play_type = coach_yr_df['PlayType']

        loc_punt_spot = coach_yr_df['yrdline100']
        loc_punt_return = coach_yr_df['Return_spot']

        loc_time_elapsed = coach_yr_df['Elapsed_Play_Time']

        ########################
        # Initialize Variables #
        ########################
        self.elapsed_time = {
            'punt': [],
            'run': [],
            'pass_good': [],
            'pass_nogood': [],
            'fg': []
        }

        self.total_passes = 0
        self.total_completions = 0
        self.pass_list = []
        self.rush_list = []

        self.pass_or_sack = 0
        self.num_sacks = 0
        self.sack_dist = []

        self.total_interceptions = 0

        field_goal_attempts = {0: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0}
        field_goal_successes = {0: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0}
        self.field_goal_pct = {}

        total_runs = 0
        total_run_fumbles = 0
        total_pass = 0
        total_pass_fumbles = 0

        self.punt_dist = []
        punt_touchback = {
            90: 0,
            80: 0,
            70: 0,
            60: 0,
            50: 0,
            40: 0,
            30: 0,
            20: 0
        }
        punt_kickrange = {
            90: 0,
            80: 0,
            70: 0,
            60: 0,
            50: 0,
            40: 0,
            30: 0,
            20: 0
        }
        punt_total = 0

        #####################
        # Punt Calculations #
        #####################
        for punt_spot, return_spot, time in zip(loc_punt_spot, loc_punt_return,
                                                loc_time_elapsed):
            if np.isnan(punt_spot) == False and np.isnan(return_spot) == False:
                punt_total += 1
                punt_range = np.floor(punt_spot / 10) * 10
                punt_kickrange[punt_range] += 1
                if return_spot == 80:
                    punt_touchback[punt_range] += 1
                else:
                    self.punt_dist.append(return_spot - (100 - punt_spot))
                if np.isnan(time) == False:
                    self.elapsed_time['punt'].append(time)
        self.punt_alpha, self.punt_loc, self.punt_beta = stats.gamma.fit(
            self.punt_dist)
        punt_x = np.arange(-10, 80, 1)
        g3 = gamma.pdf(x=punt_x,
                       a=self.punt_alpha,
                       loc=self.punt_loc,
                       scale=self.punt_beta)

        self.punt_touchback_pct = {}
        for key, value in punt_kickrange.items():
            if value != 0:
                self.punt_touchback_pct[key] = punt_touchback[key] / value

        ###########################
        # Field Goal Calculations #
        ###########################
        for fg_success, fg_distance, fg_play_type, time in zip(
                loc_fg_success, loc_fg_distance, loc_fg_play_type,
                loc_time_elapsed):

            if fg_play_type == 'Field Goal':
                marker = np.floor(fg_distance / 10) * 10
                if marker is not None:
                    if np.isnan(time) == False:
                        self.elapsed_time['fg'].append(time)
                    field_goal_attempts[marker] += 1
                    if fg_success == 'Good':
                        field_goal_successes[marker] += 1

        for key, value in field_goal_attempts.items():
            if value > 0:
                self.field_goal_pct[key] = field_goal_successes[key] / value
            else:
                self.field_goal_pct[key] = 0

        #######################
        # Fumble Calculations #
        #######################
        for i, fumble in enumerate(loc_fumble):
            current_game = loc_gameID[i]
            current_drive = loc_drive[i]
            if loc_play_type_fumble[i] == 'Pass':
                total_pass += 1
                if fumble == 1:
                    if loc_gameID[i + 1] == current_game:
                        if loc_drive[i + 1] == current_drive or loc_drive[
                                i + 1] == current_drive + 1:
                            pass
                        else:
                            total_pass_fumbles += 1
            elif loc_play_type_fumble[i] == 'Run':
                total_runs += 1
                if fumble == 1:
                    if loc_gameID[i + 1] == current_game:
                        if loc_drive[i + 1] == current_drive or loc_drive[
                                i + 1] == current_drive + 1:
                            pass
                        else:
                            total_run_fumbles += 1

        self.pass_fumble_pct = total_pass_fumbles / total_pass
        self.run_fumble_pct = total_run_fumbles / total_runs

        #############################
        # Pass and Run Calculations #
        #############################
        for pass_outcome, yrds_gained, play_type, interception, time in zip(
                loc_pass_outcome, loc_yrds_gained, loc_play_type,
                loc_interception, loc_time_elapsed):

            if play_type == 'Pass' or play_type == 'Sack':
                self.pass_or_sack += 1
                if play_type == 'Sack':
                    self.num_sacks += 1
                    self.sack_dist.append(yrds_gained)

            if play_type == 'Pass':
                self.total_passes += 1
                if pass_outcome == "Complete":
                    self.total_completions += 1
                    self.pass_list.append(yrds_gained)
                    if np.isnan(time) == False:
                        self.elapsed_time['pass_good'].append(time)
                else:
                    if np.isnan(time) == False:
                        self.elapsed_time['pass_nogood'].append(time)
                if interception == 1:
                    self.total_interceptions += 1

            elif play_type == 'Run':
                if np.isnan(time) == False:
                    self.elapsed_time['run'].append(time)
                self.rush_list.append(yrds_gained)

        self.time_kde = {}

        self.time_kde['pass_good'] = stats.gaussian_kde(
            self.elapsed_time['pass_good'], bw_method=.2)
        self.time_kde['pass_nogood'] = stats.gaussian_kde(
            self.elapsed_time['pass_nogood'], bw_method=.2)
        self.time_kde['punt'] = stats.gaussian_kde(self.elapsed_time['punt'],
                                                   bw_method=.2)
        self.time_kde['run'] = stats.gaussian_kde(self.elapsed_time['run'],
                                                  bw_method=.2)
        self.time_kde['fg'] = stats.gaussian_kde(self.elapsed_time['fg'],
                                                 bw_method=.2)

        self.pass_complete_pct = self.total_completions / self.total_passes

        self.pass_alpha, self.pass_loc, self.pass_beta = stats.gamma.fit(
            self.pass_list)
        self.run_alpha, self.run_loc, self.run_beta = stats.gamma.fit(
            self.rush_list)

        self.sack_pct = self.num_sacks / self.pass_or_sack
        self.sack_yrds_mean = np.mean(self.sack_dist)
        self.sack_yrds_std = np.std(self.sack_dist)
        self.interception_pct = self.total_interceptions / self.total_passes

        #############
        # Debugging #
        #############
        if debug_probs == True:
            pass_x = np.arange(0, 40, .1)
            g1 = gamma.pdf(x=pass_x,
                           a=self.pass_alpha,
                           loc=self.pass_loc,
                           scale=self.pass_beta)

            run_x = np.arange(-10, 20, .1)
            g2 = gamma.pdf(x=run_x,
                           a=self.run_alpha,
                           loc=self.run_loc,
                           scale=self.run_beta)

            fig2 = plt.figure()

            ax1 = fig2.add_subplot(2, 1, 1)
            ax1.plot(pass_x, g1)
            ax1.hist(self.pass_list, bins=20, density=True)  # 'normed' was replaced by 'density' in Matplotlib 3.x
            ax1.set_xlabel('Pass Yards')
            ax1.set_ylabel('Probability')

            ax2 = fig2.add_subplot(2, 1, 2)
            ax2.plot(run_x, g2)
            ax2.hist(self.rush_list, 20, density=True)
            ax2.set_xlabel('Rush Yards')
            ax2.set_ylabel('Probability')
            fig2.show()

            fig3 = plt.figure()

            ax3 = fig3.add_subplot(1, 1, 1)
            ax3.plot(punt_x, g3)
            ax3.hist(self.punt_dist, bins=20, density=True)
            fig3.show()

            fig6 = plt.figure()

            ax6 = fig6.add_subplot(1, 1, 1)
            print('TIMES', self.elapsed_time)
            for key, value in self.elapsed_time.items():
                ax6.hist(value, histtype='step', label=key)
            ax6.legend()
            fig6.show()
Example #13
        X_train, X_test, y_train, y_test = train_test_split(
            cat_labelled_data[headers],
            cat_labelled_data[category_dummies_prefix.columns],
            test_size=0.2,
            random_state=42)
        X_train = X_train[attributes_of_interest]
        X_test = X_test[attributes_of_interest]
        y_train = y_train.to_numpy()

        clf.set_params(max_depth=i)

        print(f'Model {name} {i}')
        print('-- Training')
        classifier = MultiOutputClassifier(clf, n_jobs=-1)
        classifier.fit(X_train, y_train)
        train_score = classifier.score(X_train, y_train)
        print(f'Training score: {train_score}')

        print('-- Test')
        test_predictions = classifier.predict(X_test)
        test_score = classifier.score(X_test, y_test)
        print(f'Test score: {test_score}')

        if test_score > best_model.get('score'):
            best_model['name'], best_model['score'], best_model[
                'model'] = name + ' ' + str(i), test_score, classifier

print(
    f"The best-scoring model is {best_model.get('name')} with a score of {best_model.get('score')}"
)
Example #14
rf = RandomForestClassifier(random_state=42)  # random forest classifier
rf.fit(X_train, y_train)  # fit the random forest model on the training data
# rf_predictions = rf.predict(X_test)  # predict y_test values from X_test with the fitted model
# print(rf_predictions)
# print(rf_predictions)

# Evaluate the classifier - rf model before GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
# wrapper that enables multi-output prediction

rf_classifier = MultiOutputClassifier(rf, n_jobs=1)
rf_classifier.fit(X_train, y_train)  # fit the multi-output model on the training data

rf_predictions2 = rf_classifier.predict(X_test)  # predict y_test from X_test with the fitted model
print(rf_predictions2)

print(rf_classifier.score(X_train, y_train))  # accuracy on the training set: 94.91%

# GridSearchCV: runs cross-validation and the hyperparameter search at the same time
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200, 300], 'max_depth': [5, 10, 20]}

forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg,
                           param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error')
print(grid_search.fit(X_train, y_train))

print('Best CV score: {0:.4f}'.format(grid_search.best_score_))
print('GridSearchCV best parameters: ', grid_search.best_params_)
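The grid search above tunes a plain RandomForestClassifier; a hedged follow-up sketch (reusing the names from this snippet) that puts the tuned forest back inside MultiOutputClassifier could look like this:

# Wrap the best estimator found by GridSearchCV and evaluate it the same way
best_rf_classifier = MultiOutputClassifier(grid_search.best_estimator_, n_jobs=1)
best_rf_classifier.fit(X_train, y_train)
print(best_rf_classifier.score(X_train, y_train))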
Example #15
def pedicting_tag(request):
    print('inside predicting tag')
    class lemmatokenizer(object):
        def __init__(self):
            self.stemmer = SnowballStemmer('english')
            self.token_pattern = r"(?u)\b\w\w+\b"       
    #         self.wnl = WordNetLemmatizer()
        def __call__(self,doc):                                                     # here, doc is one string sentence
            token_pattern = re.compile(self.token_pattern)
            return [self.stemmer.stem(t) for t in token_pattern.findall(doc)]       # return lambda doc: token_pattern.findall(doc) 
    #         return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


    vect_title = CountVectorizer(max_df=0.5,min_df=5,stop_words='english',tokenizer=lemmatokenizer(),ngram_range=(1,3))


    # In[9]:

    tfidf_vect_title = TfidfVectorizer(smooth_idf=False,max_df=0.5,min_df=5,stop_words='english',tokenizer=lemmatokenizer(),ngram_range=(1,3))


    le = preprocessing.LabelEncoder()  
    le.fit(y_labels) 
    d_set['label_num'] = pd.Series([le.transform(ast.literal_eval(i)) for i in d_set['tag']])
    d_set.head()


    new_y_labels = d_set['label_num'].values.tolist()

    mlb = MultiLabelBinarizer() 
    mlb.fit(new_y_labels)

    y_tag_dtm = mlb.transform(new_y_labels) 

    y_tag_dtm.shape


    # In[14]:

    X_labels = d_set['title'].values.tolist()

    # print (X_labels)


    # In[15]:

    vect_title.fit(X_labels)
    X_title_dtm = vect_title.transform(X_labels)

    X_title_dtm


    from sklearn.decomposition import PCA

    pca = PCA(n_components=100).fit(X_title_dtm.toarray())
    pca_samples = pca.transform(X_title_dtm.toarray())

    pca_df = pd.DataFrame(np.round(pca_samples,4))

    print (pca_df.head())


    # In[ ]:




    # In[17]:

    new_df = pd.DataFrame(X_title_dtm.toarray(),columns=vect_title.get_feature_names())



    new_df.shape



    d = collections.Counter(vect_title.get_feature_names())

    new_df['target_list'] = [i for i in y_tag_dtm] 


    tfidf_vect_title.fit(X_labels)
    X_title_dtm_tfidf = tfidf_vect_title.transform(X_labels)

    X_title_dtm_tfidf


    # In[23]:

    new_df_of_tfidf = pd.DataFrame(X_title_dtm_tfidf.toarray(),columns=tfidf_vect_title.get_feature_names()) 


    # In[24]:

    new_df_of_tfidf['target_list'] = [i for i in y_tag_dtm] 


    # In[25]:

    y = new_df_of_tfidf['target_list'] 
    X = new_df_of_tfidf.drop('target_list',axis=1)  


    X = np.array(X.values.tolist())                           # it will convert list to numpy ndarray
    y = np.array(y.values.tolist())


    # In[28]:

    # print (X[0]) 


    # In[29]:

    pca_X = PCA(n_components=200).fit_transform(X)  
    pca_X = np.round(pca_X,4)

    pca_y = PCA(n_components=50).fit_transform(y)  
    pca_y = np.round(pca_y,4)


    # In[30]:

    print (pca_y) 


    # In[31]:

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)   


    # In[32]:

    # X_train, X_test, y_train, y_test = train_test_split(pca_X, pca_y, test_size=0.2, random_state=1)   


    # In[ ]:




    # In[33]:

    # clf = Pipeline([('classifier',OneVsRestClassifier(SVC(probability=True,random_state=0)))])  # just to for Pipeline example

    knn_clf = KNeighborsClassifier(n_neighbors=5)
    # mnb_clf = MultinomialNB()                                                                   # not working for MultiLabelinput
    # svc_clf = OneVsRestClassifier(SVC(probability=True,random_state=0))

    # time_pass_y = np.random.randint(2,size=(2838,1))                                            # produce ndarray of size 2838 X 1

    knn_clf.fit(X_train, y_train)
    # mnb_clf.fit(X_train, y_train) 

    knn_pred = knn_clf.predict(X_test)  
    # mnb_pred = mnb_clf.predict(X_test)
    # svc_pred = svc_clf.predict(X_test)


    # In[34]:

    knn_clf.score(X_test, y_test) 


    # In[53]:

    from sklearn import metrics

    knn_report = metrics.classification_report(y_test[:100], knn_pred[:100]) 
    knn_f1_score = metrics.f1_score(y_test[:], knn_pred[:], average='samples') 
    knn_precision_recall_fscore = metrics.precision_recall_fscore_support(y_test, knn_pred, average='samples')  # on full data-set
    knn_avg_precision_score = metrics.average_precision_score(y_test, knn_pred, average='samples')
    knn_roc_auc_score = metrics.roc_auc_score(y_test, knn_pred, average='samples')

    # mnb_report = metrics.classification_report(y_test[:100], mnb_pred[:100])  #throwing error mnb_clf can't work on multilabel O/P


    # In[36]:

    metrics.accuracy_score(y_true=y_test[:100], y_pred=knn_pred[:100])          # I think it's same as calculating hamming_score


    # In[37]:

    # print (knn_report)                                   # its type is str

    print "For knn_clf (KNearestNeighbours) : "
    print "precision, recall, fbeta_score, support : ",knn_precision_recall_fscore
    print "f1_score : ",knn_f1_score
    print "avg. precision_score : ",knn_avg_precision_score 
    print "roc_auc_score : ",knn_roc_auc_score


    # In[38]:

    # def does_test_tag_match(d, list_of_tags):      # no need for this function


    # In[39]:

    test = ["how to use policy iteration in ml ?"]
    # test = ["what is lstm ?"] 

    # test_dtm = vect_title.transform(test)                                           # without tfidf
    test_dtm = tfidf_vect_title.transform(test)                                       # with tfidf

    # print (test_dtm.toarray()[0])
    status = False
    for i in test_dtm.toarray()[0]:
        if (i!=0):
            status = True
            break

    ans = knn_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)

    if (len(ans[0])==0 or status==False):
        print ("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print (ans)
        
        

    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_clf = MultiOutputClassifier(forest, n_jobs=-1)
    rf_clf.fit(X_train, y_train)
    rf_pred = rf_clf.predict(X_test)


    # In[41]:

    rf_clf 


    # In[42]:

    metrics.accuracy_score(y_true=y_test[:100], y_pred=rf_pred[:100])          # I think it's same as calculating hamming_score


    # In[43]:

    rf_clf.score(X_test, y_test)

    rf_report = metrics.classification_report(y_test[:100], rf_pred[:100])
    rf_f1_score = metrics.f1_score(y_test, rf_pred, average='samples')  
    rf_precision_recall_fscore = metrics.precision_recall_fscore_support(y_test, rf_pred, average='samples')  # on full data-set
    rf_avg_precision_score = metrics.average_precision_score(y_test, rf_pred, average='samples')
    rf_roc_auc_score = metrics.roc_auc_score(y_test, rf_pred, average='samples') 


    # In[47]:

    # print (rf_report) 

    print "For rf_clf (RandomForest) : "
    print "precision, recall, fbeta_score, support : ",rf_precision_recall_fscore
    print "f1_score : ",rf_f1_score  
    print "avg. precision_score : ",rf_avg_precision_score 
    print "roc_auc_score : ",rf_roc_auc_score

    # test = ["what is reinforcement learning ?"] 

    test = ["what is ai,lstm and data visualization ?"] 

    # test_dtm = vect_title.transform(test)                                            # without tfidf
    test_dtm = tfidf_vect_title.transform(test)                                        # with tfidf

    status = False
    for i in test_dtm.toarray()[0]:
        if (i!=0):
            status = True
            break

    ans = rf_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if (len(ans[0])==0 or status==False):
        print ("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print (ans)
Example #16
# Split Train/Test
###############################################################################
(inputs, outputs) = (DATA[FEATS], DATA[['CPT', 'WOP']])
(TRN_X, VAL_X, TRN_Y, VAL_Y) = train_test_split(
    inputs, outputs, 
    test_size=float(VT_SPLIT),
    stratify=outputs
)
(TRN_L, VAL_L) = [i.shape[0] for i in (TRN_X, VAL_X)]
###############################################################################
# Define Model
###############################################################################
rf = RandomForestClassifier(
    n_estimators=TREES, max_depth=DEPTH, criterion='entropy',
    min_samples_split=5, min_samples_leaf=50,
    max_features=None, max_leaf_nodes=None,
    n_jobs=JOB
)
clf = MultiOutputClassifier(rf)
# K-fold training -------------------------------------------------------------
kScores = cross_val_score(clf, TRN_X, TRN_Y)
kScores
###############################################################################
# Train Model
###############################################################################
clf.fit(TRN_X, TRN_Y)
# Predict ---------------------------------------------------------------------
PRD_Y = clf.predict(VAL_X)
clf.score(VAL_X, VAL_Y)
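cross_val_score above returns one score per fold; a small hedged addition to summarize those folds alongside the validation score:

# Mean and spread of the k-fold scores computed above
print('CV score: %.3f +/- %.3f' % (kScores.mean(), kScores.std()))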

Example #17
        if 'z' in col:
            y_train_regr[col] = y_train[col]
            y_test_regr[col] = y_test[col]
        else:
            y_train_clf[col] = y_train[col]
            y_test_clf[col] = y_test[col]

    mo_clf = MultiOutputClassifier(rs_clf)

    # Fit the data to the models
    print('Fitting data')
    mo_clf.fit(x_train, y_train_clf)
    rs_regr.fit(x_train, y_train_regr)

    # Print the results of the fit on the test data
    print('Test classification score: %.3f' % mo_clf.score(x_test, y_test_clf))
    print('Test regression R2 score: %.3f' %
          rs_regr.score(x_test, y_test_regr))

    # Plot the decision surfaces of the classifier and regressor
    x = pd.DataFrame(np.linspace(0, 5, 25))
    y = pd.DataFrame(np.linspace(0, 5, 25))

    # Create a grid to plot our predicted values over
    surf_x = pd.DataFrame(np.array(np.meshgrid(
        x,
        y,
    )).T.reshape(-1, 2))
    surf_z = pd.DataFrame()

    # Predict a value for each (x, y) pair in the grid
Example #18
X_test = PCA(n_components=2).fit_transform(X_test)

ax2.set_title('Test labels')
ax2.scatter(X_test[:, 0],
            X_test[:, 1],
            c=np.sum(Y_test * np.array([1, 2, 3, 4, 5]), axis=1))
ax2.set_xlabel('Feature 0 count')

forest = RandomForestClassifier(n_estimators=100, random_state=1)
decision = DecisionTreeClassifier()

# training step
multi_target_R = MultiOutputClassifier(forest, n_jobs=-1)
result_R = multi_target_R.fit(X, Y)
result_R = multi_target_R.predict(X_test)
score_R = multi_target_R.score(X_test, Y_test)

multi_target_D = MultiOutputClassifier(decision, n_jobs=-1)
multi_target_D = multi_target_D.fit(X, Y)
result_D = multi_target_D.predict(X_test)
score_D = multi_target_D.score(X_test, Y_test)

# Plot classification result
ax3.scatter(X_test[:, 0],
            X_test[:, 1],
            c=np.sum(result_D * np.array([1, 2, 3, 4, 5]), axis=1))
ax3.set_title('Decision Tree labels %0.2f' % score_D)
ax3.set_ylabel('Feature 1 count')
ax3.set_xlabel('Feature 0 count')
X_w_D = []
for i in range(len(result_D)):
Example #19
import numpy as np
import pandas as pd
from simulations.irs_v2x_simulation import IRSV2XSimulation
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from joblib import dump, load

data = pd.read_csv('data_position_simulation.csv')
irs_antnum = 256

cols_x = [IRSV2XSimulation.COL_POS_X, IRSV2XSimulation.COL_POS_Y]
# IRSV2XSimulation.COL_POS_Z,
# IRSV2XSimulation.COL_SPEED]
cols_y = []
for n in range(irs_antnum):
    cols_y.append(IRSV2XSimulation.COL_PHASE + str(n))
X = data[cols_x].to_numpy()
Y = data[cols_y].to_numpy()
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

forest = RandomForestClassifier(n_estimators=100, random_state=1)
classifier = MultiOutputClassifier(forest, n_jobs=-1)
classifier.fit(X[0:100, :], Y[0:100, :])
dump(classifier, 'classifier.joblib')
print(classifier.score(X, Y))
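A hedged companion sketch showing how the persisted model can be loaded back and reused (same file name as in the dump call above):

# Reload the persisted classifier and predict phase configurations for a few positions
restored = load('classifier.joblib')
print(restored.predict(X[100:110, :]))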
Example #20
y_train = df_train[categories].values

mltout_clf_svm = MultiOutputClassifier(svm.SVC(),
                                       n_jobs=-1).fit(x_train, y_train)
mltout_clf_per = MultiOutputClassifier(Perceptron(),
                                       n_jobs=-1).fit(x_train, y_train)

data_test = arff.loadarff('scene-test.arff')
df_test = pd.DataFrame(data_test[0])
df_test.replace(converte, inplace=True)
x_test = df_test.drop(categories, axis=1)
y_test = df_test[categories]

# Build the array with predictions for each example
y_pred_svm = mltout_clf_svm.predict(x_test)

# Accuracy of the method
print("Accuracy:", mltout_clf_svm.score(x_test, y_test))

# Confusion matrix for each label
plot_confusion_matrix(y_test, y_pred_svm, classes=[0, 1], title=categories)

# Build the array with predictions for each example
y_pred_per = mltout_clf_per.predict(x_test)

# Accuracy of the method
print("Accuracy: ", mltout_clf_per.score(x_test, y_test))

# Confusion matrix for each label (the original re-plotted y_pred_svm here,
# which looks like a copy/paste slip)
plot_confusion_matrix(y_test, y_pred_per, classes=[0, 1], title=categories)
Example #21
bestc = []
c = 0.00001
while c < 10003:
    print("*")
    biii_model = MultiOutputClassifier(LinearSVC(penalty='l1', C=c, dual=False), n_jobs=-1)
    print(c)
    score = cross_val_score(biii_model, StrainX, trainy, cv=10, n_jobs=-1)
    print(score.mean())
    bestc.append(score.mean())
    c = c * 10

biii_model = MultiOutputClassifier(LinearSVC(penalty='l1', C=1, dual=False), n_jobs=-1)
biii_model.fit(StrainX, trainy)
predl1 = biii_model.predict(StestX)
print("Test Accuracy is", biii_model.score(StestX, testy))

print("Hamming Loss", hamloss(testy, predl1))
hamlossL(testy, predl1)

print("Exact Match Score", exactmatch(testy, predl1))
exactmatchL(testy, predl1)


bestd = []
c = 0.00001
while c < 10003:
    print("*")
    biv_model = MultiOutputClassifier(LinearSVC(penalty='l1', C=c, dual=False, class_weight='balanced'), n_jobs=-1)
    print(c)
    score = cross_val_score(biv_model, StrainX, trainy, cv=10, n_jobs=-1)
Example #22
question_tfidftransformed_acp = question_acp.transform(
    question_tfidftransformed)
answer_tfidftransformed_acp = answer_acp.transform(answer_tfidftransformed)

X_transformed = sp.hstack(
    [question_tfidftransformed_acp, answer_tfidftransformed_acp, X_category])

X_train, X_test, y_train, y_test = train_test_split(X_transformed,
                                                    y_transformed,
                                                    test_size=0.15)

dtc = DecisionTreeClassifier(max_depth=5)
dtc_multi = MultiOutputClassifier(dtc, n_jobs=-1)

dtc_multi.fit(X_train, y_train)
dtc_multi.score(X_test, y_test)


def transform_split_score(X, y):
    if isinstance(X, list):
        X_transformed = sp.hstack(
            X[text_var].apply(lambda col: vectorizer.fit_transform(col)))
    else:
        # the original overwrote X_transformed unconditionally, which made the branch above dead
        X_transformed = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_transformed,
                                                        y,
                                                        test_size=0.15)
    dtc_multi.fit(X_train, y_train)
    return dtc_multi.score(X_test, y_test)


# tree interpretation
Example #23
def main():

    # Script argument parsing
    parser = argparse.ArgumentParser(
        description=
        'Homework 03 - Machine learning a.a. 2018/19 - Predict missing values',
        epilog=' coded by: Emanuele Palombo')

    parser.add_argument('dataset_name',
                        metavar='DATASET',
                        type=str,
                        nargs='?',
                        default=__default_ts_name,
                        help='{} (default {}) - dataset name'.format(
                            list(__ts_opts.keys()), __default_ts_name))

    parser.add_argument(
        '--test-size',
        '-t',
        dest='test_size',
        action='store',
        metavar='TEST_SIZE',
        type=float,
        default=__default_test_size,
        help='[0-1] (default {}) - splitting size of TestSet'.format(
            __default_test_size))

    parser.add_argument(
        '--question-marks-ts',
        '-q',
        dest='qm_repeted_ts',
        action='store',
        type=int,
        default=__default_question_mark_count_repeated,
        help=
        '{{0,1,2...}} (default {}) - (this value * {} * samples) added to TrainingSet'
        .format(__default_question_mark_count_repeated,
                __default_question_mark_count))

    parser.add_argument(
        '--no-split',
        '-s',
        dest='no_split',
        action='store_true',
        default=__default_no_split,
        help='(default {}) - keep whole DataSet for training'.format(
            __default_no_split))

    parser.add_argument('--img-tag',
                        '-i',
                        dest='img_tag',
                        action='store',
                        type=str,
                        default='',
                        help='string - add arbitrary string to saved images')

    parser.add_argument(
        '--verbose',
        '-v',
        dest='verbosity',
        action='count',
        default=__default_training_verbosity,
        help='add more verbosity to output (repeat it to increase)')

    args = parser.parse_args()

    if args.dataset_name not in __ts_opts:
        print('ERROR: Choose correct DataSet!\n')
        parser.print_help()
        exit(1)

    trainingset_selected_name = args.dataset_name
    test_size = args.test_size
    qm_repeted_ts = args.qm_repeted_ts
    dataset_no_split = args.no_split
    training_verbosity = args.verbosity
    img_tag = args.img_tag
    running_id = id_generator()

    ts_selected_opts = __ts_opts[trainingset_selected_name]
    # End script argument parsing

    print('\nDataSet selected: ' + ts_selected_opts['url'])

    # read dataset to pandas dataframe
    dataset = pd.read_csv(ts_selected_opts['url'],
                          names=ts_selected_opts['columns'])

    if training_verbosity >= 1:
        print('\nFirst five rows of DataSet:\n')
        print(dataset.head())
        print('\nDataSet Length: {}'.format(len(dataset)))

    # DataSet Manipulation
    # remove row with question marks (this avoid to have '?' on the output)
    dataset = dataset[~(dataset.astype(str) == '?').any(1)]

    # strip out (remove) the "real output" (y)
    dataset = dataset.iloc[ts_selected_opts['x_slice'][0],
                           ts_selected_opts['x_slice'][1]]

    # Different approach to value conversion
    # convert all column to int (str => int)
    # dataset = dataset.apply(lambda x: pd.factorize(x)[0] + 1)
    # convert all columns to int
    dataset = dataset.astype(int)

    # dataSet Information
    features_count = len(dataset.columns)
    features_values = ds_features_values(dataset)

    # copy input features to output (columns * 2)
    for column in dataset.columns:
        dataset['y_' + column] = dataset[column]

    # Split DataSet
    training_set, test_set = train_test_split(
        dataset,
        test_size=test_size,
        random_state=__default_train_test_split_random_state)

    # check feature values between TrainingSet and TestSet
    # it's important avoid more value on TestSet (ie. error on log_loss for mismatch in predict_proba size)
    if not check_labels_split(features_count, training_set, test_set):
        exit(1)

    # Concat (add row) TrainingSet and TestSet
    # in this case model could see all sample (included queries without '?')
    if dataset_no_split:
        training_set = pd.concat([training_set, test_set], axis=0)

        print('\nTraining over the whole DataSet')
    else:
        print('\nSplit DataSet in TrainingSet and TestSet (test size: {})'.
              format(test_size))

    # add (append) question mark
    # append qm_count rows, with 1 to qm_count '?'
    qm_count = int(ts_selected_opts['question_mark_count'])
    for i in range(qm_repeted_ts):
        for value_count in range(1, qm_count + 1):
            training_set = ds_mod_with_value(training_set, value_count,
                                             features_count, True)

            if training_verbosity >= 1:
                print(
                    '{} Added {} question mark (?) to TrainingSet for each sample'
                    .format(i, value_count))

    # Shuffle TrainingSet
    training_set = training_set.sample(frac=1)

    if training_verbosity >= 1:
        print('\nManipulated TrainingSet:\n')
        print(training_set.head())
        print('\nTrainingSet Length: {}'.format(len(training_set)))

    # TrainingSet: input X (features) and Output y ("mirrored" features))
    x_train = training_set.iloc[:, 0:features_count]
    y_train = training_set.iloc[:, features_count:]

    # TestSet: input X (features) and Output y ("mirrored" features))
    x_test = test_set.iloc[:, 0:features_count]
    y_test = test_set.iloc[:, features_count:]

    if training_verbosity >= 2:
        print('\nInput train:\n {}'.format(x_train.head()))
        print('\nOutput train:\n {}'.format(y_train.head()))
        print('\nInput test:\n {}'.format(x_test.head()))
        print('\nOutput test:\n {}'.format(y_test.head()))

    x_train = x_train.values
    y_train = y_train.values
    y_test = y_test.values

    # oneHot encoding (characteristic vector)
    # passing features_values without None force OneHotEncoder to transform None to null vector
    one_hot_encoder = OneHotEncoder(categories=features_values,
                                    handle_unknown='ignore')
    one_hot_encoder.fit(x_train)
    x_train_encoded = one_hot_encoder.transform(x_train).toarray()

    if training_verbosity >= 2:
        print('\nOneHotEncoding...\nexample: {} => {}'.format(
            x_train[0], x_train_encoded[0]))

    # store all results/metrics for each model/classifier
    results = {}

    for classifier_name in __deafult_model_classifier:

        filename = 'model_{}_{}.sav'.format(trainingset_selected_name,
                                            classifier_name)

        if os.path.isfile(filename):
            # load module already trained
            multi_output_classifier = joblib.load(filename)

            print(
                '\n### Model {} loaded by file: {}\nImportant: remove the file to re-train the model!'
                .format(classifier_name, filename))
        else:
            n_jobs = None
            model_verbosity = training_verbosity >= 3

            if classifier_name == 'MLP':
                classifier = MLPClassifier(hidden_layer_sizes=ts_selected_opts[
                    'mlp_hidden_layers_sizes'],
                                           max_iter=1000,
                                           verbose=model_verbosity)
            elif classifier_name == 'KNN':
                n_jobs = None
                classifier = KNeighborsClassifier(
                    n_neighbors=ts_selected_opts['knn_k'])
            elif classifier_name == 'SVM':
                classifier = SVC(gamma='scale',
                                 decision_function_shape='ovo',
                                 probability=True,
                                 verbose=model_verbosity)
            elif classifier_name == 'RandomForest':
                classifier = RandomForestClassifier(
                    n_estimators=ts_selected_opts['random_forest_estimator'],
                    verbose=model_verbosity)

            print('\n### Init and training the model: {}'.format(
                classifier_name))

            # init MultiOutput for classifier
            multi_output_classifier = MultiOutputClassifier(classifier,
                                                            n_jobs=n_jobs)
            multi_output_classifier.fit(x_train_encoded, y_train)

            # save the model to disk
            joblib.dump(multi_output_classifier, filename)

        results[classifier_name] = collections.defaultdict(list)
        metris_result = results[classifier_name]

        # create input test (query) with different number of '?'
        for query_count_question_mark in range(
                ts_selected_opts['question_mark_count'] + 1):

            print('\n## Add {} questions mark to input test (query)'.format(
                query_count_question_mark))

            # modify (in place) input test with question marks
            x_test_with_qm = ds_mod_with_value(
                x_test.copy(),
                value_count=query_count_question_mark,
                append=False)

            if training_verbosity >= 2:
                print('\nInput test (query):\n {}'.format(
                    pd.DataFrame(data=x_test_with_qm).head()))

            # encode the input test
            x_test_encoded = one_hot_encoder.transform(
                x_test_with_qm).toarray()

            # compute output prediction and probability
            y_pred = multi_output_classifier.predict(x_test_encoded)
            y_pred_proba = multi_output_classifier.predict_proba(
                x_test_encoded)
            # precision on whole output
            score = multi_output_classifier.score(x_test_encoded, y_test)
            # the Hamming loss corresponds to the Hamming distance between y_test and y_pred
            hamming_loss = np.sum(np.not_equal(y_test, y_pred)) / float(
                y_test.size)

            # compute y_test and y_pred how if the out was only the query question marks
            y_test_reduced, y_pred_reduced = reduce_y_to_qm(
                x_test_with_qm, y_test, y_pred)

            # write y_pred_proba to file (csv)
            write_pred_proba(
                y_pred_proba,
                '{}{}-{}-q{}-{}{}.csv'.format(__default_csv_path,
                                              trainingset_selected_name,
                                              classifier_name,
                                              query_count_question_mark,
                                              running_id, img_tag))

            print('\nMetrics:')
            print(' {:<30} | {:^10} | {:>10}'.format('features', 'accuracy',
                                                     'log loss'))
            print('-' * (30 + 10 + 10 + 7))

            log_loss_avg = 0
            # for each output column => compute accuracy and log_loss
            for feature_index in range(y_test.shape[1]):
                y_test_column = y_test[:, feature_index]
                y_pred_column = y_pred[:, feature_index]

                accuracy = accuracy_score(y_test_column, y_pred_column)
                # note: check_labels_split() was implemented to avoid errors here
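                # passing labels= fixes the full label set so the probability columns stay aligned
                # even when a class is missing from this particular test column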
                log_loss_value = log_loss(
                    y_test_column,
                    y_pred_proba[feature_index],
                    labels=features_values[feature_index])

                print(' {:<30} | {:^10.4f} | {:>10.4f}'.format(
                    test_set.columns[feature_index], accuracy, log_loss_value))

                log_loss_avg += log_loss_value

                metrics_result['accuracy_' +
                               str(feature_index)].append(accuracy)
                metrics_result['log_loss_' +
                               str(feature_index)].append(log_loss_value)

            print('\nVirtual reduced output:')
            # for each reduced output (question marks only) => compute accuracy
            for index in range(query_count_question_mark):
                accuracy = accuracy_score(y_test_reduced[:, index],
                                          y_pred_reduced[:, index])
                print(' accuracy {}:   {:>10.4f}'.format(index, accuracy))

                metrics_result['accuracy_reduced_' +
                               str(index)].append(accuracy)

            print('\nAll output:')
            print(' accuracy:     {:>10.4f}'.format(score))
            print(' log_loss avg: {:>10.4f}'.format(log_loss_avg /
                                                    y_test.shape[1]))
            print(' hamming loss: {:>10.4f}'.format(hamming_loss))

            metrics_result['accuracy'].append(score)
            metrics_result['log_loss_avg'].append(log_loss_avg /
                                                  y_test.shape[1])
            metrics_result['hamming_loss'].append(hamming_loss)

        # GRAPH PLOT per model/classifier
        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1), [
            results[classifier_name]['accuracy'],
            results[classifier_name]['log_loss_avg'],
            results[classifier_name]['hamming_loss']
        ],
                        labels=['accuracy', 'log loss avg', 'hamming loss'],
                        fmt=['bo-', 'ro-', 'yo-'],
                        title=classifier_name,
                        xlabel='Number of Question Marks in the query',
                        ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-{}{}.png'.format(__default_imgs_path,
                                                  trainingset_selected_name,
                                                  classifier_name, running_id,
                                                  img_tag),
                        dpi=200)

        # create the list of per-feature accuracy series
        accuracy_lst = [
            'accuracy_' + str(index) for index in range(features_count)
        ]
        accuracy_lst = [
            results[classifier_name][accuracy_key]
            for accuracy_key in accuracy_lst
        ]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy']] + accuracy_lst,
                        fmt=['bo-'] + ['g.--'] * len(accuracy_lst),
                        title=classifier_name +
                        ': overall accuracy and per-feature accuracies',
                        xlabel='Number of Question Marks in the query',
                        ylabel='accuracy',
                        ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # create the list of per-feature accuracy_reduced series (front-padded with None when needed)
        accuracy_reduced_lst = [
            'accuracy_reduced_' + str(index)
            for index in range(ts_selected_opts['question_mark_count'])
        ]
        accuracy_reduced_lst = [
            results[classifier_name][accuracy_reduced]
            for accuracy_reduced in accuracy_reduced_lst
        ]
        accuracy_reduced_lst = [[None] *
                                (ts_selected_opts['question_mark_count'] -
                                 len(accuracy_reduced) + 1) + accuracy_reduced
                                for accuracy_reduced in accuracy_reduced_lst]
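        # the k-th reduced series only has values once the query contains at least k+1 question
        # marks, so it is front-padded with None to keep it aligned with the x axis of the plot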

        plot_line_graph(
            range(ts_selected_opts['question_mark_count'] + 1),
            [results[classifier_name]['accuracy']] + accuracy_reduced_lst,
            fmt=['bo-'] + ['m.--'] * len(accuracy_reduced_lst),
            title=classifier_name +
            ': overall accuracy and the virtual per-feature accuracies',
            xlabel='Number of Question Marks in the query',
            ylabel='accuracy',
            ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-reduced-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # create the list of per-feature log_loss series
        log_loss_lst = [
            'log_loss_' + str(index) for index in range(features_count)
        ]
        log_loss_lst = [
            results[classifier_name][log_loss_key]
            for log_loss_key in log_loss_lst
        ]

        plot_line_graph(
            range(ts_selected_opts['question_mark_count'] + 1),
            [results[classifier_name]['log_loss_avg']] + log_loss_lst,
            fmt=['ro-'] + ['c.--'] * len(log_loss_lst),
            title=classifier_name + ': average log loss and per-feature log losses',
            xlabel='Number of Question Marks in the query',
            ylabel='log loss')

        if __default_save_img:
            plt.savefig('{}{}-{}-log-loss-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

    metrics_by_classifier = [
        results[classifier][metric]
        for classifier in __deafult_model_classifier
        for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']
    ]
    label_by_classifier = [
        classifier + ' ' + metric for classifier in __deafult_model_classifier
        for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']
    ]
    fmt_lst = [
        style.replace('0', character)
        for character in ['o', '^', 'v', '<', '>', '.', ',', '+', 'x']
        for style in ['b0-', 'r0-', 'y0-']
    ]

    # GRAPH PLOT comparing model/classifier
    plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                    metrics_by_classifier,
                    labels=label_by_classifier,
                    fmt=fmt_lst,
                    title='Compare all models',
                    xlabel='Number of Question Marks in the query',
                    ylabel='',
                    ymax=1)

    if __default_save_img:
        plt.savefig('{}{}-comparing-{}{}.png'.format(
            __default_imgs_path, trainingset_selected_name, running_id,
            img_tag),
                    dpi=200)

    if not __default_save_img:
        plt.show()
Example #24
0
class Igel(object):
    """
    Igel is the base class used to run the fit, evaluate and predict functions of the sklearn library
    """

    available_commands = ('fit', 'evaluate', 'predict', 'experiment')
    supported_types = ('regression', 'classification', 'clustering')
    results_path = configs.get('results_path')  # path to the results folder
    default_model_path = configs.get(
        'default_model_path')  # path to the pre-fitted model
    description_file = configs.get(
        'description_file')  # path to the description.json file
    evaluation_file = configs.get(
        'evaluation_file')  # path to the evaluation.json file
    prediction_file = configs.get(
        'prediction_file')  # path to the predictions.csv
    default_dataset_props = configs.get(
        'dataset_props'
    )  # dataset props that can be changed from the yaml file
    default_model_props = configs.get(
        'model_props')  # model props that can be changed from the yaml file
    model = None

    def __init__(self, **cli_args):
        logger.info(f"Entered CLI args: {cli_args}")
        logger.info(f"Executing command: {cli_args.get('cmd')} ...")
        self.data_path: str = cli_args.get('data_path')  # path to the dataset
        logger.info(f"reading data from {self.data_path}")
        self.command = cli_args.get('cmd', None)
        if not self.command or self.command not in self.available_commands:
            raise Exception(f"You must enter a valid command.\n"
                            f"available commands: {self.available_commands}")

        if self.command == "fit":
            self.yml_path = cli_args.get('yaml_path')
            file_ext = self.yml_path.split('.')[-1]
            logger.info(f"You passed the configurations as a {file_ext} file.")

            self.yaml_configs = read_yaml(
                self.yml_path) if file_ext == 'yaml' else read_json(
                    self.yml_path)
            logger.info(f"your chosen configuration: {self.yaml_configs}")

            # dataset options given by the user
            self.dataset_props: dict = self.yaml_configs.get(
                'dataset', self.default_dataset_props)
            # model options given by the user
            self.model_props: dict = self.yaml_configs.get(
                'model', self.default_model_props)
            # list of target(s) to predict
            self.target: list = self.yaml_configs.get('target')

            self.model_type: str = self.model_props.get('type')
            logger.info(f"dataset_props: {self.dataset_props} \n"
                        f"model_props: {self.model_props} \n "
                        f"target: {self.target} \n")

        # if entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used
        else:
            self.model_path = cli_args.get('model_path',
                                           self.default_model_path)
            logger.info(f"path of the pre-fitted model => {self.model_path}")
            # load description file to read stored training parameters
            with open(self.description_file, 'r') as f:
                dic = json.load(f)
                self.target: list = dic.get(
                    "target")  # target to predict as a list
                self.model_type: str = dic.get(
                    "type"
                )  # type of the model -> regression or classification
                self.dataset_props: dict = dic.get(
                    'dataset_props')  # dataset props entered while fitting
        getattr(self, self.command)()

    def _create_model(self, **kwargs):
        """
        fetch a model depending on the type and algorithm provided by the user and return it
        @return: class of the chosen model
        """
        model_type: str = self.model_props.get('type')
        model_algorithm: str = self.model_props.get('algorithm')
        use_cv = self.model_props.get('use_cv_estimator', None)

        model_args = None
        if not model_type or not model_algorithm:
            raise Exception(f"model_type and algorithm cannot be None")
        algorithms: dict = models_dict.get(
            model_type)  # extract all algorithms as a dictionary
        model = algorithms.get(
            model_algorithm)  # extract model class depending on the algorithm
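        # `model` is expected to be a dict with a 'class' key and, optionally, a 'cv_class'
        # key holding the cross-validated variant (see the use_cv branch below)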
        logger.info(
            f"Solving a {model_type} problem using ===> {model_algorithm}")
        if not model:
            raise Exception("Model not found in the algorithms list")
        else:
            model_props_args = self.model_props.get('arguments', None)
            if model_props_args and type(model_props_args) == dict:
                model_args = model_props_args
            elif not model_props_args or model_props_args.lower() == "default":
                model_args = None

            if use_cv:
                model_class = model.get('cv_class', None)
                if model_class:
                    logger.info(
                        f"cross validation estimator detected. "
                        f"Switch to the CV version of the {model_algorithm} algorithm"
                    )
                else:
                    logger.info(
                        f"No CV class found for the {model_algorithm} algorithm"
                    )
            else:
                model_class = model.get('class')
            logger.info(f"model arguments: \n"
                        f"{self.model_props.get('arguments')}")
            model = model_class(**kwargs) if not model_args else model_class(
                **model_args)
            return model, model_args

    def _save_model(self, model):
        """
        save the model to a binary file
        @param model: model to save
        @return: bool
        """
        try:
            if not os.path.exists(self.results_path):
                logger.info(
                    f"creating model_results folder to save results...\n"
                    f"path of the results folder: {self.results_path}")
                os.mkdir(self.results_path)
            else:
                logger.info(f"Folder {self.results_path} already exists")
                logger.warning(
                    f"data in the {self.results_path} folder will be overridden. If you don't "
                    f"want this, then move the current {self.results_path} to another path"
                )

        except OSError:
            logger.exception(
                f"Creating the directory {self.results_path} failed ")
        else:
            logger.info(
                f"Successfully created the directory in {self.results_path} ")
            pickle.dump(model, open(self.default_model_path, 'wb'))
            return True

    def _load_model(self, f: str = ''):
        """
        load a saved model from file
        @param f: path to model
        @return: loaded model
        """
        try:
            if not f:
                logger.info(f"result path: {self.results_path} ")
                logger.info(f"loading model from {self.default_model_path}")
                model = pickle.load(open(self.default_model_path, 'rb'))
            else:
                logger.info(f"loading from {f}")
                model = pickle.load(open(f, 'rb'))
            return model
        except FileNotFoundError:
            logger.error(f"File not found in {self.default_model_path} ")

    def _prepare_fit_data(self):
        return self._process_data(target='fit')

    def _prepare_eval_data(self):
        return self._process_data(target='evaluate')

    def _process_data(self, target='fit'):
        """
        read and return data as x and y
        @return: list of separate x and y
        """
        assert isinstance(self.target,
                          list), "provide target(s) as a list in the yaml file"
        if self.model_type != "clustering":
            assert len(
                self.target) > 0, "please provide at least a target to predict"

        try:
            read_data_options = self.dataset_props.get('read_data_options',
                                                       None)
            dataset = pd.read_csv(
                self.data_path) if not read_data_options else pd.read_csv(
                    self.data_path, **read_data_options)
            logger.info(f"dataset shape: {dataset.shape}")
            attributes = list(dataset.columns)
            logger.info(f"dataset attributes: {attributes}")

            # handle missing values in the dataset
            preprocess_props = self.dataset_props.get('preprocess', None)
            if preprocess_props:
                # handle encoding
                encoding = preprocess_props.get('encoding')
                if encoding:
                    encoding_type = encoding.get('type', None)
                    column = encoding.get('column', None)
                    if column in attributes:
                        dataset, classes_map = encode(
                            df=dataset,
                            encoding_type=encoding_type.lower(),
                            column=column)
                        if classes_map:
                            self.dataset_props[
                                'label_encoding_classes'] = classes_map
                            logger.info(
                                f"adding classes_map to dataset props: \n{classes_map}"
                            )
                        logger.info(
                            f"shape of the dataset after encoding => {dataset.shape}"
                        )

                # preprocessing strategy: mean, median, mode etc..
                strategy = preprocess_props.get('missing_values')
                if strategy:
                    dataset = handle_missing_values(dataset, strategy=strategy)
                    logger.info(
                        f"shape of the dataset after handling missing values => {dataset.shape}"
                    )

            if target == 'predict' or target == 'fit_cluster':
                x = _reshape(dataset.to_numpy())
                if not preprocess_props:
                    return x
                scaling_props = preprocess_props.get('scale', None)
                if not scaling_props:
                    return x
                else:
                    scaling_method = scaling_props.get('method', None)
                    return normalize(x, method=scaling_method)

            if any(col not in attributes for col in self.target):
                raise Exception(
                    "chosen target(s) to predict must exist in the dataset")

            y = pd.concat([dataset.pop(x) for x in self.target], axis=1)
            x = _reshape(dataset.to_numpy())
            y = _reshape(y.to_numpy())
            logger.info(f"y shape: {y.shape} and x shape: {x.shape}")

            # handle data scaling
            if preprocess_props:
                scaling_props = preprocess_props.get('scale', None)
                if scaling_props:
                    scaling_method = scaling_props.get('method', None)
                    scaling_target = scaling_props.get('target', None)
                    if scaling_target == 'all':
                        x = normalize(x, method=scaling_method)
                        y = normalize(y, method=scaling_method)
                    elif scaling_target == 'inputs':
                        x = normalize(x, method=scaling_method)
                    elif scaling_target == 'outputs':
                        y = normalize(y, method=scaling_method)

            if target == 'evaluate':
                return x, y

            split_options = self.dataset_props.get('split', None)
            if not split_options:
                return x, y, None, None
            test_size = split_options.get('test_size')
            shuffle = split_options.get('shuffle')
            stratify = split_options.get('stratify')
            x_train, x_test, y_train, y_test = train_test_split(
                x,
                y,
                test_size=test_size,
                shuffle=shuffle,
                stratify=None
                if not stratify or stratify.lower() == "default" else stratify)

            return x_train, y_train, x_test, y_test

        except Exception as e:
            logger.exception(
                f"error occurred while preparing the data: {e.args}")

    def _prepare_clustering_data(self):
        """
        preprocess data for the clustering algorithm
        """
        return self._process_data(target='fit_cluster')

    def _prepare_predict_data(self):
        """
        preprocess the prediction data so that it matches the data used when training the model
        """
        return self._process_data(target='predict')

    def get_evaluation(self, model, x_test, y_true, y_pred, **kwargs):
        res = None
        try:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 get_score_only=False,
                                 **kwargs)
        except Exception as e:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 get_score_only=True,
                                 **kwargs)
        return res

    def fit(self, **kwargs):
        """
        fit a machine learning model and save it to a file along with a description.json file
        @return: None
        """
        x_train = None
        x_test = None
        y_train = None
        y_test = None
        cv_results = None
        eval_results = None
        cv_params = None
        if self.model_type == 'clustering':
            x_train = self._prepare_clustering_data()
        else:
            x_train, y_train, x_test, y_test = self._prepare_fit_data()
        self.model, model_args = self._create_model(**kwargs)
        logger.info(
            f"executing a {self.model.__class__.__name__} algorithm...")

        # convert to multioutput if there is more than one target to predict:
        if self.model_type != 'clustering' and len(self.target) > 1:
            logger.info(
                f"multiple targets to predict were detected, so the model will be"
                f" automatically converted to a multioutput model")
            self.model = MultiOutputClassifier(self.model) \
                if self.model_type == 'classification' else MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            cv_params = self.model_props.get('cross_validate', None)
            if not cv_params:
                logger.info(f"cross validation is not provided")
            else:
                cv_results = cross_validate(estimator=self.model,
                                            X=x_train,
                                            y=y_train,
                                            **cv_params)
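            # cross_validate works on clones of the estimator, so the model itself
            # still has to be fitted explicitly on the training data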
            self.model.fit(x_train, y_train)
        else:
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(
                f"model saved successfully and can be found in the {self.results_path} folder"
            )

        if self.model_type == 'clustering':
            eval_results = self.model.score(x_train)
        else:
            if x_test is None:
                logger.info(
                    f"no split options were provided. The training score will be calculated"
                )
                eval_results = self.model.score(x_train, y_train)

            else:
                logger.info(
                    f"split option detected. The performance will be automatically evaluated "
                    f"using the test data portion")
                y_pred = self.model.predict(x_test)
                eval_results = self.get_evaluation(model=self.model,
                                                   x_test=x_test,
                                                   y_true=y_test,
                                                   y_pred=y_pred,
                                                   **kwargs)

        fit_description = {
            "model": self.model.__class__.__name__,
            "arguments": model_args if model_args else "default",
            "type": self.model_props['type'],
            "algorithm": self.model_props['algorithm'],
            "dataset_props": self.dataset_props,
            "model_props": self.model_props,
            "data_path": self.data_path,
            "train_data_shape": x_train.shape,
            "test_data_shape": None if x_test is None else x_test.shape,
            "train_data_size": x_train.shape[0],
            "test_data_size": None if x_test is None else x_test.shape[0],
            "results_path": str(self.results_path),
            "model_path": str(self.default_model_path),
            "target": None if self.model_type == 'clustering' else self.target,
            "results_on_test_data": eval_results
        }
        if self.model_type == 'clustering':
            clustering_res = {
                "cluster_centers": self.model.cluster_centers_,
                "cluster_labels": self.model.labels_
            }
            fit_description['clustering_results'] = clustering_res

        if cv_params:
            cv_res = {
                "fit_time": cv_results['fit_time'].tolist(),
                "score_time": cv_results['score_time'].tolist(),
                "test_score": cv_results['test_score'].tolist()
            }
            fit_description['cross_validation_params'] = cv_params
            fit_description['cross_validation_results'] = cv_res

        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(
                f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save the results to an evaluation.json file
        @return: None
        """
        x_val = None
        y_true = None
        eval_results = None
        try:
            model = self._load_model()
            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                y_pred = model.predict(x_val)
                eval_results = model.score(x_val, y_pred)

            logger.info(f"saving evaluation results to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)

        except Exception as e:
            logger.exception(f"error occurred during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as csv
        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            x_val = self._prepare_predict_data(
            )  # the same is used for clustering
            y_pred = model.predict(x_val)
            y_pred = _reshape(y_pred)
            logger.info(
                f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}"
            )
            logger.info(f"predict on targets: {self.target}")
            df_pred = pd.DataFrame.from_dict({
                self.target[i]: y_pred[:,
                                       i] if len(y_pred.shape) > 1 else y_pred
                for i in range(len(self.target))
            })

            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)

        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_mock_file(model_type=None,
                              model_name=None,
                              target=None,
                              *args,
                              **kwargs):
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")

        dataset_props = Igel.default_dataset_props
        model_props = Igel.default_model_props
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initializing a default igel.yaml in {path}")
        default_data = {
            "dataset":
            dataset_props,
            "model":
            model_props,
            "target": ['provide your target(s) here']
            if not target else [tg for tg in target.split()]
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(
                f"a default igel.yaml is created for you in {path}. "
                f"you just need to overwrite the values to meet your expectations"
            )
        else:
            logger.warning(
                f"something went wrong while initializing a default file")
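
A minimal usage sketch (not part of the original example): since the constructor dispatches to the chosen command via getattr, the class can be driven directly from Python with the same keyword arguments the CLI would pass. The file names below are placeholders.

# hedged sketch only; 'train.csv', 'validation.csv', 'new_data.csv' and 'igel.yaml' are hypothetical paths
Igel(cmd='fit', data_path='train.csv', yaml_path='igel.yaml')
Igel(cmd='evaluate', data_path='validation.csv')
Igel(cmd='predict', data_path='new_data.csv')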
Example #25
0
    datasetY.append(indices)  # np.array(indices).astype('int')

mlb = MultiLabelBinarizer()  # classes=len(radionuclides)
datasetY = mlb.fit_transform(datasetY)
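# fit_transform turns each list of label indices into a fixed-length binary indicator row,
# e.g. [[0, 2], [1]] -> [[1, 0, 1], [0, 1, 0]]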

#datasetX = StandardScaler().fit_transform(datasetX)
X_train, X_test, y_train, y_test = \
    train_test_split(datasetX, datasetY, test_size=.4, random_state=42)

#print(y_train)
#print(type(y_train))
#y_train = y_train.astype('int')
#y_test = y_test.astype('int')

#print(X_train)
#print(y_train)

classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(score)

# predict
inv = ag.UnstablesInventory(data=[
    (db.getzai(radionuclides[2]), ACTIVITY),
    (db.getzai(radionuclides[0]), ACTIVITY),
    (db.getzai(radionuclides[5]), ACTIVITY),
    (db.getzai(radionuclides[3]), ACTIVITY)
])
hist, _ = lc(inv, spectype=SPECTYPE)
print(classifier.predict([[1 if count > 0 else 0 for count in hist]]))
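
The excerpt uses a `classifier` object created earlier, outside the quoted lines. A stand-in consistent with the binary indicator targets produced above could look like the following sketch (the estimator choice and parameters are assumptions, not part of the original example):

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# hypothetical stand-in for the `classifier` used above; any estimator that accepts
# multilabel indicator targets would expose the same fit/score/predict interface
classifier = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))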