Example #1
    def word2token(self, s):
        """Encode word s as a numeric feature vector:
        length, suffix/prefix codes, then letter-bigram indicators."""
        env = Environment()
        bgm_columns = env.bgm_columns_list(mode=1)
        n_shift = 5  # number of scalar features before the bigram block

        a_result = np.zeros(len(bgm_columns) + n_shift)
        a_result[0] = len(s)
        a_result[1] = self.s_encode(s[-2:])  # ts2: 2-letter suffix
        a_result[2] = self.s_encode(s[-3:])  # ts3: 3-letter suffix
        a_result[3] = self.s_encode(s[:2])   # tp2: 2-letter prefix
        a_result[4] = self.s_encode(s[:3])   # tp3: 3-letter prefix

        # Set the indicator column for every letter bigram in the word
        di_letters = env.di_bgm_byletters
        for n_l in range(0, len(s) - 1):
            di_n = di_letters.get('%s%s' % (s[n_l], s[n_l + 1]))
            if di_n is not None:
                a_result[di_n + n_shift] = 1
        return a_result
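For orientation, here is a minimal standalone sketch of the same letter-bigram encoding idea. The bigram index `di_bgm`, the `n_shift` offset, and the hash-based stand-in for `s_encode` are all invented for this sketch; in the project those come from `Environment` and the encoder itself.

import numpy as np

# Toy bigram -> column index; the project derives this map from the alphabet
di_bgm = {'ab': 0, 'bb': 1, 'ba': 2}
n_shift = 2  # scalar features stored before the bigram indicators

def toy_word2token(s):
    vec = np.zeros(n_shift + len(di_bgm))
    vec[0] = len(s)                 # word length
    vec[1] = hash(s[-2:]) % 97      # stand-in for an s_encode() suffix code
    for i in range(len(s) - 1):
        col = di_bgm.get(s[i:i + 2])
        if col is not None:
            vec[col + n_shift] = 1  # mark bigram as present
    return vec

print(toy_word2token('abba'))  # sets the 'ab', 'bb' and 'ba' indicators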
Example #2
    def tokenz_create_stat(self, dftokenz=pd.DataFrame(), n_frac=1):
        """Count how often each letter bigram occurs in the tokenz set
        and persist the sorted statistic as CSV."""
        env = Environment()
        di_letters = env.di_bgm_byletters
        bgm_columns = env.bgm_columns_list(mode=1)
        if dftokenz.empty:
            dftokenz = self.tokenz()
        if n_frac < 1:
            dftokenz = dftokenz.sample(frac=n_frac)
        env.debug(1, [
            'POStagger', 'create_stat',
            'Collecting statistic START %s words' % dftokenz.shape[0]
        ])
        # Non-null counts per column give the occurrence statistic
        di_tokenz_stat = dftokenz.count().to_dict()
        bgm_astat = []
        bgm_index = []
        for key in di_letters:
            di_n = di_letters.get(key)
            bgm_astat.append([key, di_tokenz_stat.get(bgm_columns[di_n])])
            bgm_index.append(di_n)
        df_bgm_stat = pd.DataFrame(data=bgm_astat,
                                   columns=['bigram', 'counts'],
                                   index=bgm_index)
        df_bgm_stat.index.name = 'idbigram'
        df_bgm_stat = df_bgm_stat.sort_values(by=['counts'], ascending=False)
        print('bgm_stat\n', df_bgm_stat)
        df_bgm_stat.to_csv(env.filename_stat_bigram_letters_csv(),
                           encoding='utf-8')
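The count-and-sort pattern above can be checked in isolation on a toy frame. The column names and the bigram map below are invented; the real ones come from env.bgm_columns_list and env.di_bgm_byletters.

import pandas as pd

# Toy tokenz frame: bigram indicator columns, NaN means "bigram absent"
df = pd.DataFrame({'bgm_l_0': [1, None, 1], 'bgm_l_1': [None, None, 1]})
di_letters = {'ab': 0, 'ba': 1}        # bigram -> column id
bgm_columns = ['bgm_l_0', 'bgm_l_1']

counts = df.count().to_dict()          # non-null count per column
stat = pd.DataFrame(
    [[key, counts[bgm_columns[idx]]] for key, idx in di_letters.items()],
    columns=['bigram', 'counts'],
    index=list(di_letters.values()))
stat.index.name = 'idbigram'
print(stat.sort_values(by='counts', ascending=False))  # 'ab': 2, 'ba': 1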
Example #3
    def train(self,
              df=pd.DataFrame(),
              validation='eval',
              n_splits=5,
              b_smoketest=True,
              n_frac=1):
        env = Environment()
        enc = Word_Encoder()
        df_train = df
        drop_columns = [
            'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
            'n_token'
        ]
        env.debug(1,
                  ['POStagger', 'train',
                   'Drop columns: %s' % (drop_columns)])

        if df_train.empty:
            t_start = timer()
            df_train = self.tokenz()
            t_end = timer()
            env.debug(1, [
                'POStagger', 'train', 'tokenz loaded:', 'time:',
                env.job_time(t_start, t_end)
            ])

        env.debug(1, [
            'POStagger', 'train',
            'All tokenz set shape %s' % df_train.shape[0]
        ])
        t_start = timer()
        env.debug(1, ['POStagger', 'train', 'Learning: START'])
        if n_frac < 1:
            df_train = df_train.sample(frac=n_frac)
            env.debug(1, [
                'POStagger', 'train',
                'Training tokenz set shape %s' % df_train.shape[0]
            ])

        df_train = df_train.drop(columns=drop_columns)
        env.debug(
            1, ['POStagger',
                'Train columns: %s' % (df_train.columns.tolist())])

        df_train = df_train.fillna(0)
        file_x = env.filename_xtrain_csv()
        df_train.to_csv(file_x, encoding='utf-8')
        env.debug(1, ['POStagger', 'train', 'Save X', file_x])
        y = df_train['idgram'].values
        df_train.drop(columns=['idgram'], inplace=True)
        X = df_train.values
        seed = 241
        frac_test_size = 0.2

        sc = StandardScaler()
        if validation == 'cv':  # k-fold cross-validation requested
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
            # Decision tree: ~0.81 accuracy with entropy, ~0.79 with gini;
            # KNeighborsClassifier(n_neighbors=230) was also tried here
            env.debug(1, ['Tree cross-validation'])
            model = DecisionTreeClassifier(criterion='entropy',
                                           random_state=seed)
            env.debug(
                1, ['Calculate cross_val_score. Splits=%s' % (n_splits)])
            scores = cross_val_score(model, X, y, cv=kf)
            print('DTree scores:', scores.mean(), 'raw', scores)

            if False:  # Logistic regression (disabled alternative)
                env.debug(1, ['LGR cross-validation'])
                n_Cs = [0.01]
                # Use only the bigram columns (after the 5 scalar features),
                # scaled; binarize the target to pos/non-pos
                X_sc = sc.fit_transform(X[:, 5:])
                Y_bin = y.copy()
                Y_bin[Y_bin > 0] = 1
                for n_c in n_Cs:
                    clf = LogisticRegression(penalty='l2',
                                             solver='liblinear',
                                             C=n_c)
                    # SVC and Perceptron variants were also tried here
                    env.debug(1, [
                        'Calculate cross_val_score. Splits=%s C=%s' %
                        (n_splits, n_c)
                    ])
                    scores = cross_val_score(clf, X_sc, Y_bin, cv=kf)
                    print(scores)

            if False:  # GBM / RandomForest (disabled alternative)
                env.debug(1, ['GBM cross-validation'])
                asteps = [20]  # GBM; [100] was used for RandomForest
                for i in asteps:
                    # clf = RandomForestClassifier(n_estimators=i)
                    clf = GradientBoostingClassifier(
                        n_estimators=i, max_depth=8)
                    env.debug(1, [
                        'Calculate cross_val_score. Splits=%s Estimators=%s' %
                        (n_splits, i)
                    ])
                    scores = cross_val_score(clf, X, y, cv=kf)
                    print(scores)

        if validation == 'eval':
            model = xgb.XGBClassifier(n_estimators=140,
                                      max_depth=16,
                                      colsample_bytree=1,
                                      subsample=0.5,
                                      random_state=seed)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=frac_test_size,
                random_state=seed,
                shuffle=True)
            eval_set = [(X_train, y_train), (X_test, y_test)]
            f_eval = 'merror'  # multiclass error; 'mlogloss' also works
            model.fit(X_train,
                      y_train,
                      eval_metric=f_eval,
                      eval_set=eval_set,
                      verbose=False,
                      early_stopping_rounds=20)
            ev_scores = model.evals_result()
            ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
            print(ev_mean, ev_scores)
            xgb.plot_importance(model)
            plt.show()
        t_end = timer()
        env.debug(1, ['Validation completed:', 'time:',
                      env.job_time(t_start, t_end)])

        if validation == 'cv':
            # Refit the chosen model on the full dataset
            X_train, y_train = X, y
            # Accuracy of other models tried: DecisionTreeClassifier ~0.79,
            # LinearDiscriminantAnalysis ~0.47, LogisticRegression ~0.48,
            # KNeighborsClassifier(n_neighbors=200) ~0.48, GaussianNB ~0.43

            t_start = timer()
            env.debug(1, ['Training: START'])
            model.fit(X_train, y_train)
            t_end = timer()
            env.debug(1, ['Training: END', env.job_time(t_start, t_end)])

        with open(env.filename_scaler(), 'wb') as f:
            pickle.dump(sc, f)
        with open(env.filename_model_tree(), 'wb') as f:
            pickle.dump(model, f)

        # Smoke test on a short list of Russian words
        if b_smoketest:
            X_smoke_predict = [
                'съеште', 'ещё', 'этих', 'мягких', 'французских', 'булок'
            ]
            a_smoke = np.array(
                [enc.word2token(elem) for elem in X_smoke_predict])
            y_predictions = model.predict(a_smoke)
            y_predictions_proba = model.predict_proba(a_smoke)
            print('Prediction', list(zip(X_smoke_predict, y_predictions)))
            print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
        return model
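The two validation paths reduce to a standard sklearn shape, shown below on synthetic data. Everything here (dataset, sizes, tree settings) is illustrative rather than the project's configuration, and xgboost is swapped for a plain decision tree so the sketch stays self-contained.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=20, n_informative=8,
                           n_classes=4, random_state=241)

# 'cv' path: k-fold scores, then a refit on the full set
kf = KFold(n_splits=5, shuffle=True, random_state=241)
model = DecisionTreeClassifier(criterion='entropy', random_state=241)
scores = cross_val_score(model, X, y, cv=kf)
print('CV mean:', scores.mean(), 'raw:', scores)
model.fit(X, y)

# 'eval' path: score against a held-out split instead
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                          random_state=241, shuffle=True)
model.fit(X_tr, y_tr)
print('Holdout accuracy:', model.score(X_te, y_te))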
Example #4
    def tokenize(self, dftokenz=pd.DataFrame(), persistent=True, n_frac=1):
        env = Environment()
        enc = Word_Encoder()
        t_start = timer()
        if dftokenz.empty:
            dftokenz = self.tokenz()
        if n_frac < 1:
            dftokenz = dftokenz.sample(frac=n_frac)
        env.debug(
            1, ['Transforming to tokenz: START %s words' % dftokenz.shape[0]])

        gmask = dftokenz.groupby(['gram'])
        df_posstat = gmask.count()
        df_posstat.to_csv(env.filename_stat_pos_tokenz_csv(), encoding='utf-8')
        print('POSTagger', 'train dataset stat:\n', gmask.count())

        fields = [
            's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3', 'n_token',
            'n_len', 'n_tokens2', 'n_tokens3', 'n_tokenp2', 'n_tokenp3'
        ]

        # String columns (prefix 's_') default to '', numeric columns to 0.0
        for field in fields:
            dftokenz[field] = '' if field[0] == 's' else 0.0

        di_letters = env.di_bgm_byletters
        bgm_columns = env.bgm_columns_list(mode=1)

        # Bigram indicator columns start as None and are set to 1 below
        for column_name in bgm_columns:
            dftokenz[column_name] = None

        t_end = timer()
        env.debug(1, [
            'POStagger', 'Letters bigram columns added',
            env.job_time(t_start, t_end)
        ])

        # Form tokenz
        t_start = timer()
        for index, serie in dftokenz.iterrows():
            a_word = enc.s2token(index, serie)
            # Feature fields start at a_word[2]; a_word[0] is the raw word
            i = 2
            for field in fields:
                dftokenz.at[index, field] = a_word[i]
                i = i + 1
            # Letter-bigram binaries: flag every bigram present in the word
            for n_l in range(0, len(a_word[0]) - 1):
                di_n = di_letters.get('%s%s' %
                                      (a_word[0][n_l], a_word[0][n_l + 1]))
                if di_n is not None:
                    dftokenz.at[index, bgm_columns[di_n]] = 1
        t_end = timer()
        env.debug(
            1,
            ['Transforming to tokenz: COMPLETE',
             env.job_time(t_start, t_end)])
        if persistent:
            dftokenz.to_csv(env.filename_tokenz_csv(), encoding='utf-8')
            env.debug(1, ['Tokenz written to CSV:', env.filename_tokenz_csv()])
        return dftokenz
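The per-row .at update pattern used above, reduced to a self-contained toy (all names below are invented for the example):

import pandas as pd

df = pd.DataFrame({'word': ['abba', 'ok']})
di_letters = {'ab': 0, 'ba': 1, 'ok': 2}   # bigram -> column id
bgm_columns = ['bgm_l_0', 'bgm_l_1', 'bgm_l_2']
df['n_len'] = 0
for c in bgm_columns:
    df[c] = None

# .at writes one scalar cell at a time, which is what the loop above relies on
for index, row in df.iterrows():
    w = row['word']
    df.at[index, 'n_len'] = len(w)
    for i in range(len(w) - 1):
        col = di_letters.get(w[i:i + 2])
        if col is not None:
            df.at[index, bgm_columns[col]] = 1
print(df)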