Example no. 1
    def train(self,
              df=None,
              validation='eval',
              n_splits=5,
              b_smoketest=True,
              n_frac=1):
        """Train the POS-tagger classifier on a tokenized word dataset.

        Args:
            df: optional pandas DataFrame with pre-loaded tokenz. When None
                (or empty), tokenz are loaded via ``self.tokenz()``.
                (Original signature used the mutable default
                ``df=pd.DataFrame()``, evaluated once at def time; the
                ``None`` sentinel preserves the same semantics safely.)
            validation: 'eval' — single hold-out split evaluated with
                XGBoost; 'cv' — k-fold cross-validation with a decision
                tree, then refit on the full data.
            n_splits: number of folds for 'cv' validation.
            b_smoketest: when True, run a quick sanity prediction on a few
                sample words after training.
            n_frac: fraction (0 < n_frac <= 1] of the tokenz set to train on.

        Returns:
            The fitted classifier (also pickled to
            ``env.filename_model_tree()``).

        Raises:
            ValueError: if ``validation`` is neither 'cv' nor 'eval'
                (the original code crashed later with a NameError instead).
        """
        env = Environment()
        enc = Word_Encoder()
        df_train = pd.DataFrame() if df is None else df
        # Feature columns that must not be fed to the model (raw strings
        # and bookkeeping fields).
        drop_columns = [
            'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
            'n_token'
        ]
        env.debug(1,
                  ['POStagger', 'train',
                   'Drop colums: %s' % (drop_columns)])

        if df_train.empty:
            # No dataset supplied — load the tokenized corpus.
            t_start = timer()
            df_train = self.tokenz()
            t_end = timer()
            env.debug(1, [
                'POSTagger', 'train', 'tokenz loaded:', 'time:',
                env.job_time(t_start, t_end)
            ])

        env.debug(1, [
            'POStagger', 'train',
            'All tokenz set shape %s' % df_train.shape[0]
        ])
        t_start = timer()
        env.debug(1, ['POStagger', 'train', 'Learning: START'])
        if n_frac < 1:
            # Subsample the training set for faster experimentation.
            df_train = df_train.sample(frac=n_frac)
            env.debug(1, [
                'POStagger', 'train',
                'Training tokenz set shape %s' % df_train.shape[0]
            ])

        df_train = df_train.drop(columns=drop_columns, axis=1)
        env.debug(
            1, ['POStagger',
                'Train colums: %s' % (df_train.columns.tolist())])

        df_train = df_train.fillna(0)
        # Persist the training matrix for reproducibility/debugging.
        file_x = env.filename_xtrain_csv()
        df_train.to_csv(file_x, encoding='utf-8')
        env.debug(1, ['POStagger', 'train', 'Save X', file_x])
        y = df_train['idgram'].values  # target: grammeme id
        df_train.drop(columns=['idgram'], inplace=True)
        X = df_train.values

        seed = 241
        frac_test_size = 0.2

        # NOTE(review): the scaler is pickled for downstream consumers even
        # though the tree/boosting models below do not need scaling; it is
        # never fit here — confirm callers expect an unfitted scaler.
        sc = StandardScaler()

        if validation not in ('cv', 'eval'):
            raise ValueError(
                "validation must be 'cv' or 'eval', got %r" % (validation,))

        if validation == 'cv':  # Need cross-validation
            # NOTE(review): dead experimental branches (LogisticRegression,
            # GBM/RandomForest under `if False:`) were removed — they
            # referenced undefined names (`array`, `Y`) and could never run.
            env.debug(1, ['Tree cross-validation'])
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
            model = DecisionTreeClassifier(criterion='entropy',
                                           random_state=seed)  # 0.81
            env.debug(
                1, ['Calculate cross_val_score. Splits=%s' % (n_splits)])
            scores = cross_val_score(model, X, y, cv=kf)
            print('DTree scores:', scores.mean(), 'raw', scores)

        if validation == 'eval':
            # Single hold-out evaluation with XGBoost and early stopping.
            model = xgb.XGBClassifier(n_estimators=140,
                                      max_depth=16,
                                      colsample=1,
                                      subsample=0.5,
                                      seed=seed)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=frac_test_size,
                random_state=seed,
                shuffle=True)
            eval_set = [(X_train, y_train), (X_test, y_test)]
            f_eval = 'merror'  # multiclass classification error rate
            model.fit(X_train,
                      y_train,
                      eval_metric=f_eval,
                      eval_set=eval_set,
                      verbose=False,
                      early_stopping_rounds=20)
            ev_scores = model.evals_result()
            ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
            print(ev_mean, ev_scores)
            xgb.plot_importance(model)
            plt.show()
        t_end = timer()
        env.debug(1, ['CV completed:', 'time:', env.job_time(t_start, t_end)])

        if validation == 'cv':
            # Refit the cross-validated model on the full dataset.
            t_start = timer()
            env.debug(1, ['Training: START'])
            model.fit(X, y)
            t_end = timer()
            env.debug(1, ['Training: END', env.job_time(t_start, t_end)])

        # BUGFIX: use context managers — the original leaked file handles
        # via `pickle.dump(obj, open(path, 'wb'))`.
        with open(env.filename_scaler(), 'wb') as f_scaler:
            pickle.dump(sc, f_scaler)
        with open(env.filename_model_tree(), 'wb') as f_model:
            pickle.dump(model, f_model)

        # Smoke test: sanity-check predictions on a handful of sample words.
        if b_smoketest:
            X_smoke_predict = [
                'съеште', 'ещё', 'этих', 'мягких', 'французских', 'булок'
            ]
            a_smoke = np.array(
                [enc.word2token(elem) for elem in X_smoke_predict])
            y_predictions = model.predict(a_smoke[:, 0:])
            # BUGFIX: the original called predict() twice, so the 'Proba'
            # line printed class labels; probabilities require
            # predict_proba().
            y_predictions_proba = model.predict_proba(a_smoke[:, 0:])
            print('Prediction', list(zip(X_smoke_predict, y_predictions)))
            print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
        return model