Example #1
 def grammemes_xml2csv(self, persistent=True):
     env = Environment()
     filename_gram = env.filename_grammemes_xml()
     dfcols = ['name', 'alias', 'description']
     df_xml = pd.DataFrame(columns=dfcols)
     try:
         tree = ET.ElementTree(file=filename_gram)
     except Exception:
         env.debug(1, ['Failed to load grammemes from XML:', filename_gram])
     else:
         env.debug(1, ['Read grammemes:', filename_gram])
         for elem in tree.iter('grammeme'):
             #print(elem.tag, elem.attrib)
             sattr = elem.attrib.get('include')
             if sattr == 'on':
                 sname = sali = sdesc = ''
                 for child in elem:
                     if child.tag.lower() == 'name':
                         sname = child.text.upper()
                     elif child.tag.lower() == 'alias':
                         sali = child.text.upper()
                     elif child.tag.lower() == 'description':
                         sdesc = child.text.lower()
                 s = pd.Series(data=[sname, sali, sdesc], index=dfcols)
                 df_xml = df_xml.append(s, ignore_index=True)
         df_xml.index.name = 'idgram'
         if persistent:
             filename_csv = env.filename_grammemes_csv()
             env.debug(1, ['Write grammemes to CSV:', filename_csv])
             df_xml.to_csv(filename_csv, encoding='utf-8')
     return df_xml
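
A minimal usage sketch for this converter, assuming OpenCorpus (the class these corpus methods are called on in the main() examples below) is importable; the module path is hypothetical:

    from mlivos_corpus import OpenCorpus  # hypothetical module path

    c = OpenCorpus()
    df_gram = c.grammemes_xml2csv(persistent=True)  # parse the grammemes XML and write the CSV
    print(df_gram.head())
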
Example #2
 def vocabulary_from_corpus(self, n_min=1, n_max=10, persistent=True):
     env = Environment()
     df_voc = pd.DataFrame()
     #dfgram = self.grammemes()
     for i in range(n_min, n_max + 1):
         file_csv = env.filename_corpus_csv(i)
         try:
             dffile = pd.read_csv(file_csv,
                                  index_col='idcorpus',
                                  encoding='utf-8')
         except Exception:
             env.debug(1, ['Failed to read corpus file:', file_csv])
         else:
             env.debug(1, ['Read OK:', file_csv])
             if not dffile.empty:
                 df_voc = df_voc.append(dffile)
     df_voc = df_voc.drop_duplicates()
     df_voc.columns = ['word', 'gram', 'idgram']
     df_voc = df_voc.reset_index(drop=True)
     df_voc.index.name = 'idcorpus'
     if persistent:
         file_voc = env.filename_vocabulary_csv()
         env.debug(1, ['Write vocabulary to CSV:', file_voc])
         df_voc.to_csv(file_voc, encoding='utf-8')
     return df_voc
Example #3
    def get_texts_stat(self, mode='train'):
        # Prepare the data
        env = Environment()
        if mode == 'train':
            file_res = env.filename_results_csv()
        elif mode == 'test':
            file_res = env.filename_stat_test_csv()
        authors = pd.read_csv(env.filename_authors_csv(),
                              index_col='idauthor',
                              encoding='utf-8')

        data = pd.read_csv(file_res, index_col='idstat', encoding='utf-8')
        data.drop(columns=['file', 'idchunk'], inplace=True)
        columns = data.columns

        group = data.groupby(['idtext', 'idauthor', 'author', 'name'])
        group = group.agg({
            'sentences_text': ['mean'],
            'words_text': ['mean'],
            'sentence_mean': ['mean'],
            'sentences_chunk': ['mean'],
            'words_chunk': ['mean'],
            'words_uniq_chunk': ['mean'],
            'uniq_per_sent_chunk': ['mean'],
            'uniq_per_words_chunk': ['mean'],
            'NOUN': ['mean'],
            'ADJF': ['mean'],
            'ADJS': ['mean'],
            'COMP': ['mean'],
            'VERB': ['mean'],
            'INFN': ['mean'],
            'PRTF': ['mean'],
            'PRTS': ['mean'],
            'GRND': ['mean'],
            'NUMR': ['mean'],
            'ADVB': ['mean'],
            'NPRO': ['mean'],
            'PRED': ['mean'],
            'PREP': ['mean'],
            'CONJ': ['mean'],
            'PRCL': ['mean'],
            'INTJ': ['mean'],
            'predict': ['sum']
        })
        group.columns = columns[4:]
        group.reset_index(inplace=True)
        data = pd.merge(group,
                        authors,
                        on='idauthor',
                        how='left',
                        suffixes=('', '_author'))
        if mode == 'test':
            data['predict'] = data['predict'].astype(int)
        data = pd.merge(data,
                        authors,
                        left_on='predict',
                        right_on='idauthor',
                        how='left',
                        suffixes=('', '_predict'))
        return data
Example #4
    def word2token(self, s):
        t_start = timer()
        env = Environment()
        bgm_columns = env.bgm_columns_list(mode=1)
        n_shift = 5

        a_result = np.zeros(len(bgm_columns) + n_shift)
        a_result[0] = len(s)
        a_result[1] = self.s_encode(s[-2:])  # ts2: two-letter suffix
        a_result[2] = self.s_encode(s[-3:])  # ts3: three-letter suffix
        a_result[3] = self.s_encode(s[:2])   # tp2: two-letter prefix (was s[2:], which drops the prefix instead)
        a_result[4] = self.s_encode(s[:3])   # tp3: three-letter prefix (was s[3:])

        t_end = timer()
        #env.debug(1, ['WordEncoder', 'word2token', '%s without bgm takes %s sec.' % (s, env.job_time(t_start, t_end))])
        #t_start = timer()

        di_letters = env.di_bgm_byletters
        #print(di_letters)
        di_word = {}
        for n_l in range(0, len(s) - 1):
            n_l2 = n_l + 1
            di_n = di_letters.get('%s%s' % (s[n_l], s[n_l2]))
            #print('%s%s' % (s[n_l], s[n_l2]),di_n)
            if di_n is not None:
                #print(di_n)
                a_result[di_n + n_shift] = 1
        t_end = timer()
        #env.debug(1, ['WordEncoder', 'word2token', '%s takes %s sec.' % (s, env.job_time(t_start, t_end))])
        return a_result
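
A quick usage sketch for the encoder, mirroring the commented-out calls in the main() example further down (the import path is hypothetical):

    from mlivos_encoder import Word_Encoder  # hypothetical module path

    enc = Word_Encoder()
    tokens = enc.word2token('паровоз')  # fixed-length numeric feature vector for one word
    print(len(tokens), tokens[:5])      # word length, suffix/prefix codes, then bigram flags
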
Example #5
    def model_train(self):
        env = Environment()
        data = self.stat()
        t_start = timer()
        y, X = self.model_prepare_data(data)

        seed = 241
        scoring = 'accuracy'
        n_splits = 4
        frac_test_size = 0.25

        #Cross-validation
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        #clf = DecisionTreeClassifier(criterion='gini', random_state=seed)
        #clf = GradientBoostingClassifier(n_estimators=50)
        model = xgb.XGBClassifier(n_estimators=400,
                                  max_depth=24,
                                  colsample_bytree=1,
                                  subsample=1,
                                  seed=seed)
        cv_scores = cross_val_score(model, X, y, cv=kf)

        #eval
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=frac_test_size, random_state=seed)
        eval_set = [(X_train, y_train), (X_test, y_test)]
        #print(eval_set)
        f_eval = 'merror'
        # f_eval = 'mlogloss'
        model.fit(X_train,
                  y_train,
                  eval_metric=f_eval,
                  eval_set=eval_set,
                  verbose=False,
                  early_stopping_rounds=10)
        ev_scores = model.evals_result()

        cv_mean = np.array(cv_scores.mean())
        #ev_mean = np.array(ev_scores['validation_0']['mlogloss']).mean()
        ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()

        #Look at the feature importances of the model
        #print(model.feature_importances_)
        xgb.plot_importance(model)
        #plt.bar(range(len(model.feature_importances_)), model.feature_importances_)
        plt.show()

        #Train the model on all of the data
        model.fit(X, y, verbose=False)
        #Save the model to disk
        pickle.dump(model, open(env.filename_model_texts(), 'wb'))

        #print('CV', cv_scores, 'EV', ev_scores)
        print('Cross-validation: mean', cv_mean, 'eval_set mean', ev_mean)
        return model
Example #6
	def visit_CombinatorNode(self, node):
		env = Environment()
		for parameter in node.parameters():
			env.add(str(parameter))
		d = env.index
		self.visit('E', node.body(), env = env)
		self.code.Update(d)
		self.code.Pop(d)
		self.code.Unwind()
		self.symtab[node.name()].code = self.code.clone()
		self.code.clear()
Example #7
 def visit_CombinatorNode(self, node):
     env = Environment()
     for parameter in node.parameters():
         env.add(str(parameter))
     d = env.index
     self.visit('E', node.body(), env=env)
     self.code.Update(d)
     self.code.Pop(d)
     self.code.Unwind()
     self.symtab[node.name()].code = self.code.clone()
     self.code.clear()
Example #8
 def model_predict(self, df, b_retrain=False):
     env = Environment()
     y, X = self.model_prepare_data(df, mode='test')
     if b_retrain:
         #Retrain the model from scratch for every test run if desired
         model = self.model_train()
     else:
         #Load the previously trained model from disk
         model = pickle.load(open(env.filename_model_texts(), 'rb'))
     #Predict
     y = model.predict(X)
     return y
Example #9
    def pos_word_by_ml(self, awords):
        env = Environment()
        enc = Word_Encoder()

        file_model = env.filename_model_tree()
        clf = pickle.load(open(file_model, 'rb'))
        a_predict = np.array([enc.word2token('')])
        for word in awords:
            a_padd = [enc.word2token(word)]
            #print(word, a_padd)
            a_predict = np.append(a_predict, a_padd, axis=0)
        a_predict = a_predict[1:]
        #print(a_predict[0, 100])
        predictions = clf.predict(a_predict[:, 0:])
        return (predictions[0:])
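
A usage sketch based on the call in the main() example further down (the import path is hypothetical):

    from mlivos_postagger import POSTagger  # hypothetical module path

    t = POSTagger()
    words = ['съеште', 'школа', 'господина']
    print(t.pos_word_by_ml(words))  # array of predicted grammeme ids, one per word
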
Example #10
    def model_prepare_data(self, df, mode='train'):
        env = Environment()
        data = df.copy()
        data.drop(columns=['file', 'idchunk', 'predict'], inplace=True)

        columns = data.columns

        #idstat,idtext,idchunk,idauthor,author,name,file,words_all,words_chunk,sentences_all,sentence_mean,words_uniq,uniq_per_words,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ

        columns2drop = [
            'idtext', 'idauthor', 'author', 'name', 'sentences_text',
            'words_text', 'sentences_chunk', 'words_chunk', 'words_uniq_chunk'
        ]

        #New features
        #Create new statistical fields to help the model
        #data['words_uniq_per_sentense'] = data['words_uniq'] / data['sentences_all'] #number of unique words / number of sentences
        #data['words_uniq_3k'] = data['words_uniq'] / 3000  # number of unique words per 3,000 words
        #data['words_uniq_10k'] = data['words_uniq'] / 10000 #number of unique words per 10,000 words

        y = None
        if mode == 'train':
            y = data['idauthor']
        X = data.drop(columns=columns2drop)

        #Add PCA features
        n_components = 4
        pca_cols2drop = [
            'sentence_mean', 'uniq_per_sent_chunk', 'uniq_per_words_chunk'
        ]
        if mode == 'train':  #fit the feature-projection matrix
            pca_pos = PCA(n_components=n_components)
            X_new = pca_pos.fit_transform(X.drop(columns=pca_cols2drop), y)
            print(
                'PCA ratio %s components quality: %s' %
                (n_components,
                 round(np.sum(pca_pos.explained_variance_ratio_), 4)),
                pca_pos.explained_variance_ratio_)
            pickle.dump(pca_pos, open(env.filename_model_texts_pca(), 'wb'))
        if mode == 'test':  #project the features using the previously fitted matrix
            pca_pos = pickle.load(open(env.filename_model_texts_pca(), 'rb'))
            X_new = pca_pos.transform(X.drop(columns=pca_cols2drop))
        for i in range(0, n_components):
            X['pca_%s' % i] = X_new[:, i]
        return y, X
Example #11
def main():
    parser = argparse.ArgumentParser(description="RL exercise.")
    ADD = parser.add_argument
    ADD('-e',
        '--environment',
        default='CartPole-v1',
        help="Name of the OpenAI gym environment to train on.")
    ADD('-m',
        '--numEpisodesPerEval',
        type=int,
        default=100,
        help="Number of episodes per policy update iteration.")
    ADD('-n',
        '--numIterations',
        type=int,
        default=1000,
        help="Number of policy updates.")
    ADD('-r',
        '--renderEvery',
        type=int,
        default=0,
        help="Render every nth episode. 0 to disable.")
    ADD('-l',
        '--learningRate',
        type=float,
        default=0.1,
        help="Learning rate of policy update.")
    ADD('-z',
        '--sigma',
        type=float,
        default=0.2,
        help=
        "Standard deviation of policy for continuous actions. Exploration noise."
        )
    ADD('-p',
        '--populationSize',
        type=int,
        default=64,
        help="Population size.")
    args = parser.parse_args()

    # Create the environment, the policy and the training algorithm.
    env = Environment(args.environment, args.numEpisodesPerEval,
                      args.renderEvery)
    policy = DiscretePolicy(env)
    algo = EvolutionStrategies(policy,
                               populationSize=args.populationSize,
                               sigma=args.sigma,
                               learnRate=args.learningRate)

    # Train the policy.
    algo.trainPolicy(policy, env, args.numIterations)
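
Assuming the example above lives in a script (the file name train_es.py is only an assumption; the flags come from the parser above), it could be launched roughly like this:

    python train_es.py --environment CartPole-v1 --numIterations 500 \
        --populationSize 64 --sigma 0.2 --learningRate 0.1 --renderEvery 50
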
Example #12
 def stat(self):
     env = Environment()
     data = pd.DataFrame()
     file_stat = env.filename_results_csv()
     try:
         data = pd.read_csv(file_stat, index_col='idstat', encoding='utf-8')
     except Exception:
         env.debug(1, ['Failed to read stat file:', file_stat])
     else:
         env.debug(1, ['Read stat file OK:', file_stat])
     #print(data)
     return data
Example #13
 def tokenz(self):
     env = Environment()
     df_tokenz = pd.DataFrame()
     file_tokenz = env.filename_tokenz_csv()
     try:
         df_tokenz = pd.read_csv(file_tokenz,
                                 index_col='idcorpus',
                                 encoding='utf-8')
     except Exception:
         env.debug(1, ['Failed to read tokenz file:', file_tokenz])
     else:
         env.debug(1, ['Read tokenz OK:', file_tokenz])
     return df_tokenz
Example #14
 def authors(self, mode=0):
     env = Environment()
     df = pd.DataFrame()
     filename = env.filename_authors_csv()
     try:
         df = pd.read_csv(filename, index_col='idauthor', encoding='utf-8')
     except Exception:
         env.debug(1, ['Failed to load authors CSV file', filename])
     else:
         env.debug(1, ['Load authors CSV file', filename])
     if mode == 1:
         return df.to_dict().get('name')
     else:
         return df
Example #15
 def corpus_xml2txt(self, num=1, persistent=True):
     result = True
     env = Environment()
     file_xml = env.filename_corpus_xml(num)
     try:
         tree = ET.ElementTree(file=file_xml)
     except Exception:
         env.debug(1, ['Failed to load XML:', file_xml])
         result = False
     else:
         file_txt = env.filename_corpus_txt(num)
         file = open(file_txt, mode='w')
         for elem in tree.iter('source'):
             # print(elem.text, elem.tag, elem.attrib)
             file.write(elem.text)
             file.write(' ')
         file.close()
         env.debug(1, ['Write corpus file to TXT:', file_txt])
     return result
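
A usage sketch that mirrors the conversion loop in the main() example below (the import path is hypothetical):

    from mlivos_corpus import OpenCorpus  # hypothetical module path

    c = OpenCorpus()
    for i in range(125, 150):        # convert a range of corpus files, as in main()
        if not c.corpus_xml2txt(i):
            print('Conversion failed for corpus', i)
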
Example #16
 def grammemes(self, mode=0):
     env = Environment()
     dfgram = pd.DataFrame()
     filename_gram = env.filename_grammemes_csv()
     try:
         dfgram = pd.read_csv(filename_gram,
                              index_col='idgram',
                              encoding='utf-8')
     except Exception:
         env.debug(1, ['Failed to load grammemes CSV file', filename_gram])
     else:
         env.debug(1, ['Load grammemes CSV file', filename_gram])
     if mode == 1:
         return dfgram.to_dict().get('name')
     else:
         return dfgram
Example #17
def main():
    parser = argparse.ArgumentParser(description="RL exercise.")
    ADD = parser.add_argument
    ADD('-e',
        '--environment',
        default='CartPole-v1',
        help="Name of the OpenAI gym environment to train on.")
    ADD('-m',
        '--numEpisodesPerEval',
        type=int,
        default=100,
        help="Number of episodes per policy update iteration.")
    ADD('-n',
        '--numIterations',
        type=int,
        default=1000,
        help="Number of policy updates.")
    ADD('-l',
        '--learningRate',
        type=float,
        default=0.1,
        help="Learning rate of policy update.")
    ADD('-g',
        '--gamma',
        type=float,
        default=0.99,
        help="Rewards discount factor.")
    ADD('-r',
        '--renderEvery',
        type=int,
        default=0,
        help="Render every nth episode. 0 to disable.")
    args = parser.parse_args()

    # Create the environment, the policy and the training algorithm.
    env = Environment(args.environment, args.numEpisodesPerEval,
                      args.renderEvery)
    policy = DiscretePolicy(env)
    algo = ReinforceAlgorithm(policy,
                              gamma=args.gamma,
                              learnRate=args.learningRate)

    # Train the policy.
    algo.trainPolicy(policy, env, args.numIterations)
Example #18
def main():
    pd.set_option("display.max_columns", 100)
    pd.set_option('display.width', 1000)

    #Helper classes
    env = Environment()
    c = OpenCorpus()
    t = POSTagger()
    a = mlAnalyzer()
    enc = Word_Encoder()
    g = pd.DataFrame()
    g = c.grammemes()
    dg = c.grammemes(mode=1)  #Part-of-speech reference; mode=1 returns a python dict
    da = c.authors(mode=1)  #Authors reference

    #Example: process texts from texts_train and append statistical information to results
    #a_texts_train = [1, 16]
    #a_texts_train = [48]
    #for i in a_texts_train:
    #    a.process_from_texts_file([i])

    #Example: visualize the statistical information from results in 2D space
    #a.vizualize2d()

    #Example: visualize the statistical information on parts of speech
    #t.vizualize2d(n_frac = 0.001)

    #Predict the author of a text from text_test
    #[0, 1, 2, 3, 4]) #predict all texts - slow
    text2predict = [3]
    y = a.predict(text2predict)  #predict - pass the text number
    j = 0
    for i in y:
        print('idtext=%s' % text2predict[j], da.get(i))
        j = j + 1
Example #19
 def tokenz_create_stat(self, dftokenz=pd.DataFrame(), n_frac=1):
     env = Environment()
     enc = Word_Encoder()
     di_letters = Environment.di_bgm_byletters
     bgm_columns = env.bgm_columns_list(mode=1)
     t_start = timer()
     if dftokenz.empty:
         dftokenz = self.tokenz()
     if n_frac < 1:
         dftokenz = dftokenz.sample(frac=n_frac)
     env.debug(1, [
         'POStagger', 'create_stat',
         'Collecting statistic START %s words' % dftokenz.shape[0]
     ])
     di_tokenz_stat = (dftokenz.count()).to_dict()
     di_tokenz_res = {}
     #print('di_letters', di_letters)
     print('di_tokenz_stat', di_tokenz_stat)
     bgm_astat = [['init', 0]]
     bgm_index = []
     for key in di_letters:
         di_n = di_letters.get(key)
         column_stat = di_tokenz_stat.get(bgm_columns[di_n])
         #di_tokenz_res[key] = column_stat
         bgm_astat.append([key, column_stat])
         bgm_index.append(di_n)
     bgm_astat = bgm_astat[1:]
     print('column stat', bgm_astat)
     df_bgm_stat = pd.DataFrame(data=bgm_astat,
                                columns=['bigram', 'counts'],
                                index=bgm_index)
     df_bgm_stat.index.name = 'idbigram'
     df_bgm_stat = df_bgm_stat.sort_values(by=['counts'], ascending=False)
     print('bgm_stat\n', df_bgm_stat)
     df_bgm_stat.to_csv(env.filename_stat_bigram_letters_csv(),
                        encoding='utf-8')
Example #20
    def vizualize2d(self, n_frac=0.01, b_annotations=False):
        n_components = 2
        env = Environment()
        c = OpenCorpus()
        di_g = c.grammemes(mode=1)
        data = self.tokenz().sample(frac=n_frac)

        data = data.fillna(0)
        #print(data['idgram'].shape)
        #print(data.index.shape)
        tdf = pd.DataFrame(index=data.index)
        tdf['idgram'] = data['idgram']
        tdf['gram'] = data['gram']
        tdf['word'] = data['word']
        #print(tdf)

        drop_columns = [
            'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
            'n_token'
        ]  # , 'bgm_l_None'
        # drop_columns.extend(['bgm_l_%s' % (i) for i in range(1, env.bgm_columns_max()) if 'bgm_l_%s' % (i) not in bgm_columns])
        env.debug(
            1,
            ['POStagger', 'visualize2D',
             'Drop colums: %s' % (drop_columns)])
        data = data.drop(columns=drop_columns, axis=1)
        values = data.values
        X = values[:, 1:]
        y = values[:, 0]
        #print(data.head,X, y)
        #return 0

        #Scalers
        sc = StandardScaler()
        min_max_scaler = preprocessing.MinMaxScaler()
        max_abs_scaler = preprocessing.MaxAbsScaler()
        #X = sc.fit_transform(X)

        #PCA
        b_pca = False
        b_sne = True
        if b_pca:
            model = PCA(n_components=n_components)
        if b_sne:
            model = MDS(n_components=n_components)  #TSNE
        X_new = model.fit_transform(X, y)
        if b_pca:
            print('PCA ratio', n_components, 'components',
                  model.explained_variance_ratio_)
        #X_new = sc.fit_transform(X_new)
        #X_new = preprocessing.scale(X_new)
        if b_pca:
            X_new = max_abs_scaler.fit_transform(X_new)
        #return 0

        #tdf = pd.DataFrame(data=X_new, columns=['PC1', 'PC2'], index=data.index)
        tdf['PC1'] = X_new[:, 0]
        tdf['PC2'] = X_new[:, 1]
        #finalDf = pd.concat([tdf, data[['idgram']]], axis=1)
        df_groups = tdf.groupby('idgram').count()
        #print(df_groups)
        #return 0
        tdf['counts'] = 0
        for index, serie in tdf.iterrows():
            n_idgram = tdf.at[index, 'idgram']
            tdf.at[index,
                   'counts'] = df_groups[df_groups.index == n_idgram]['gram']
        tdf = tdf.sort_values(by=['counts'], ascending=False)
        #print(tdf)

        #Draw
        i = 0
        N = df_groups.shape[0]
        s_title = ''
        if b_pca:
            s_title = '2 component PCA. Точность %s' % (round(
                sum(float(i) for i in model.explained_variance_ratio_), 2))
        if b_sne:
            s_title = 't-SNE'

        #Plotly
        if False:  #Plotly
            py.sign_in('shashmaxus', 'AdfwTulrOoV3cSlbZT3B')
            c = [
                'hsl(' + str(h) + ',50%' + ',50%)'
                for h in np.linspace(0, 360, N)
            ]
            data_trace = []
            for index, row in df_groups.iterrows():
                #print(index)
                df_trace = tdf[tdf['idgram'] == index]
                #print(df_trace)
                g_trace = go.Scatter(
                    x=df_trace['PC1'].values,
                    y=df_trace['PC2'].values,
                    name=df_trace['gram'].values[0],
                    mode='markers',  #'markers+text'
                    marker=dict(
                        size=8,
                        color=i,  #c[i]
                        opacity=0.8,
                        colorscale='Viridis'),
                    text=df_trace['word'],
                    textfont=dict(family='sans serif', size=12))
                data_trace.append(g_trace)
                i += 1
            layout = go.Layout(
                title=s_title_pca,
                xaxis=dict(
                    title=('Component 1. Вклад %s' %
                           (round(pca.explained_variance_ratio_[0], 2)))),
                yaxis=dict(
                    title=('Component 2. Вклад %s' %
                           (round(pca.explained_variance_ratio_[1], 2)))))
            fig2 = go.Figure(data=data_trace, layout=layout)
            py.image.save_as(fig2,
                             filename='c:/prj/mlivos_data/temp/Words2.png')

        #Bokeh
        if True:
            palette = d3['Category20'][len(tdf['gram'].unique())]
            #palette = all_palettes['Category20'][len(tdf['gram'].unique())]
            #palette = Viridis256[len(tdf['gram'].unique())]
            #palette = Viridis256
            color_map = CategoricalColorMapper(factors=tdf['gram'].unique(),
                                               palette=palette)
            #print(mapper)
            fig = figure(title=s_title, toolbar_location=None)
            source = ColumnDataSource(tdf[['gram', 'PC1', 'PC2']])
            fig.scatter(x='PC1',
                        y='PC2',
                        size=12,
                        color={
                            'field': 'gram',
                            'transform': color_map
                        },
                        legend='gram',
                        source=source)
            show(fig)
            export_png(fig, filename="c:/prj/mlivos_data/temp/PCA.png")
        return 0
Example #21
    def predict(self, aidtext, b_makestat=False):
        env = Environment()

        # Open the file with statistics on the test texts
        df_stat = pd.read_csv(
            env.filename_stat_test_csv(), index_col='idstat',
            encoding='utf-8')  # Statistics on the test texts

        df_texts = pd.read_csv(env.filename_predict_csv(),
                               index_col='idtext',
                               encoding='utf-8')  # Text registry
        mask = df_texts.index.isin(aidtext)
        df_texts = df_texts[mask]

        columns = ['idtext', 'idchunk', 'idauthor', 'author', 'name', 'file', \
                   'sentences_text', 'words_text','sentence_mean', \
                   'sentences_chunk', 'words_chunk',
                   'words_uniq_chunk','uniq_per_sent_chunk','uniq_per_words_chunk', \
                  'NOUN','ADJF','ADJS','COMP','VERB','INFN','PRTF','PRTS','GRND','NUMR',\
                  'ADVB','NPRO','PRED','PREP','CONJ','PRCL','INTJ', 'predict']
        y_result = []

        #If statistics for the test texts still need to be prepared
        if b_makestat:
            for index, row in df_texts.iterrows():  # For every text that has to be processed
                file_txt = df_texts.at[index, 'filename']
                # Read text file
                env.debug(1,
                          ['Analyzer', 'predict', 'START file TXT:', file_txt])
                t_start = timer()
                file = codecs.open(file_txt, "r", "utf_8_sig")
                text = file.read().strip()
                file.close()
                # Strictly speaking, the test set carries no author label
                idauthor = df_texts.at[index, 'idauthor']  # Author
                #idauthor = 0
                name = df_texts.at[index, 'name']  # Title

                # The actual text processing
                df_add = self.analyze_text(
                    columns, text, index, idauthor, name,
                    file_txt)  # Analyze text, get Series
                #print(df_add)
                df_add.reset_index(drop=True, inplace=True)
                df_stat = df_stat.append(
                    df_add, ignore_index=True)  #Append to the results frame
                df_stat.reset_index(drop=True, inplace=True)
                df_stat.index.name = 'idstat'
                t_end = timer()
                env.debug(1, [
                    'END file TXT:', file_txt, 'time:',
                    env.job_time(t_start, t_end)
                ])
            #df_stat now holds information on every test text we wanted to process
            #Cast the integer columns to the proper type
            int_cols = [
                'idtext', 'idchunk', 'idauthor', 'sentences_text',
                'words_text', 'sentences_chunk', 'words_chunk',
                'words_uniq_chunk'
            ]
            for col in int_cols:
                df_stat[col] = df_stat[col].astype(int)
            # Save the result to disk
            df_stat.to_csv(env.filename_stat_test_csv(), encoding='utf-8')
        #The statistics are ready

        # Re-open the file with statistics on the test texts
        df_stat = pd.read_csv(
            env.filename_stat_test_csv(), index_col='idstat',
            encoding='utf-8')  # Statistics on the test texts
        #mask = df_stat.index.isin(aidtext)
        #df_stat2predict = df_stat[mask]
        #Predict the authors
        y_res = self.model_predict(df_stat.loc[aidtext])
        #print(y_res)
        df_stat.loc[aidtext, 'predict'] = y_res.astype(int)
        #print(df_stat)
        #y_result.append(y_res[0])
        #Save the updated file with the predictions
        df_stat.to_csv(env.filename_stat_test_csv(), encoding='utf-8')
        return y_res  #Return the predictions
Example #22
    def vizualize2d(self, mode='train'):
        n_components = 2
        env = Environment()
        data = self.get_texts_stat(mode=mode)
        columns = data.columns
        #print(data)
        #print(columns)
        columns2drop = [
            'idtext', 'idauthor', 'author', 'name', 'sentences_text',
            'words_text', 'sentence_mean', 'sentences_chunk', 'words_chunk',
            'words_uniq_chunk', 'uniq_per_sent_chunk', 'predict', 'shortname',
            'name_author'
        ]

        y = data['idauthor'].values
        X = data.drop(columns=columns2drop).values
        #print(y, X)
        #return 0

        #print(data)
        #print(X, y)
        pca = PCA(n_components=n_components)
        #pca = TSNE(n_components=2)
        X_new = pca.fit_transform(X, y)
        print('PCA ratio 2 components', pca.explained_variance_ratio_)
        #print('components', pca.components_)
        #print(X_new)
        tdf = pd.DataFrame(data=X_new, columns=['PC1', 'PC2'])
        finalDf = pd.concat([tdf, data[['idauthor', 'name', 'shortname']]],
                            axis=1)
        print('dataframe ', finalDf)

        mpl.style.use('default')

        rcParams['font.family'] = 'sans-serif'
        rcParams['font.sans-serif'] = ['Tahoma']

        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(1, 1, 1)
        ax.set_xlabel('Component 1. Вклад ' +
                      str(round(pca.explained_variance_ratio_[0], 2)),
                      fontsize=12)
        ax.set_ylabel('Component 2. Вклад ' +
                      str(round(pca.explained_variance_ratio_[1], 2)),
                      fontsize=12)
        ax.set_title(
            '2 component PCA. Точность ' +
            str(round(sum(float(i)
                          for i in pca.explained_variance_ratio_), 2)),
            fontsize=12)
        targets = data.idauthor.unique()
        print(targets)
        legends = data.shortname.unique()
        print(legends)
        #print(targets)
        #colors = ['r', 'g', 'b']
        #colors = "bgcmykw" #without r
        #colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
        #              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
        #              '#bcbd22', '#17becf']
        colors = [
            "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", "#98df8a",
            "#d62728", "#ff9896", "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
            "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7", "#bcbd22", "#dbdb8d",
            "#17becf", "#9edae5"
        ]
        for target in targets:
            indicesToKeep = finalDf['idauthor'] == target
            ax.scatter(finalDf.loc[indicesToKeep, 'PC1'],
                       finalDf.loc[indicesToKeep, 'PC2'],
                       c=colors[target],
                       s=50)
        for index, row in finalDf.iterrows():
            ax.annotate(
                finalDf.at[index, 'name'],
                xy=(finalDf.at[index, 'PC1'], finalDf.at[index, 'PC2']),
                #                        xytext=(0.05, 0.05),
                fontsize=8)
        ax.legend(legends)
        ax.grid()
        plt.show()
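
A usage sketch matching the calls in the main() example below (the import path is hypothetical):

    from mlivos_analyzer import mlAnalyzer  # hypothetical module path

    a = mlAnalyzer()
    a.vizualize2d(mode='train')  # PCA projection of the training-set text statistics
    a.vizualize2d(mode='test')   # same projection for the test set
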
Example #23
 def vocabulary(self):
     env = Environment()
     file_voc = env.filename_vocabulary_csv()  #from vocabulary file
     file_dict = env.filename_dict_csv()  #from dictionary file
     try:
         df_voc = pd.read_csv(file_voc,
                              index_col='idcorpus',
                              encoding='utf-8')
     except Exception:
         env.debug(1, ['Failed to read vocabulary file:', file_voc])
     else:
         env.debug(1, ['Read vocabulary OK:', file_voc])
     try:
         df_dict = pd.read_csv(file_dict,
                               index_col='idcorpus',
                               encoding='utf-8')
     except Exception:
         env.debug(1, ['Failed to read dictionary file:', file_dict])
     else:
         env.debug(1, ['Read dictionary OK:', file_dict])
     #Concat
     df_res = pd.concat([df_voc, df_dict])
     df_res = df_res.drop_duplicates()
     #Apply patch words
     df_patch = pd.read_csv(env.filename_vocabulary_patch_csv(),
                            index_col='idcorpus',
                            encoding='utf-8')
     df_res = df_res.drop(df_res[df_res['word'].isin(
         df_patch['word'])].index,
                          axis=0)
     df_res = pd.concat([df_res, df_patch])
     #print(df_res[df_res['word'].isin(df_patch['word'])])
     df_res = df_res.reset_index(drop=True)
     df_res.index.name = 'idcorpus'
     #print(df_res)
     return df_res
Example #24
    def make_report(self):
        env = Environment()
        a = mlAnalyzer()
        #template = Template('Hello {{ name }}!')
        #print(template.render(name=u'Вася'))
        jenv = jinjaEnvironment(loader = FileSystemLoader(env.path_templates()))
        #print(jenv.loader)
        template = jenv.get_template("report_global.tpl.html")

        data = a.get_texts_stat() #Statistics on the texts of the training set
        test = a.get_texts_stat(mode='test') #Statistics on the texts of the test set
        #print(data)
        #test['predict'] = test['predict'].astype(int)
        test['validation'] = 0
        test.loc[test.idauthor == test.predict,'validation'] = 1
        print(data)
        print(test)

        #Summary stat
        group = pd.merge(data, test, on='idauthor', how='left', suffixes=('', '_test'))
        print(group)
        group = group.groupby(['idauthor', 'name_author'], as_index=False).agg({'idtext' : ['nunique'],
                                                                'words_chunk' : ['sum'],
                                                                'name_test': ['nunique'],
                                                                'words_chunk_test': ['sum'],
                                                                'validation' : ['mean']
                                                                    })


        print(group)
        group.drop(['idauthor'], axis=1, inplace=True)
        group.sort_values('name_author', inplace=True)
        #Rename the columns to their Russian labels
        group.columns = ['Писатель',
                         'Кол-во текстов для обучения',
                         'Объём текстов для обучения (кол-во слов)',
                         'Кол-во текстов для проверки',
                         'Объём текстов для проверки (кол-во слов)',
                         'Точность определения'
                         ]
        n_accuracy = group['Точность определения'].mean()
        #Show integer values without a fractional part
        int_cols = ['Кол-во текстов для обучения',
                         'Объём текстов для обучения (кол-во слов)',
                         'Кол-во текстов для проверки',
                         'Объём текстов для проверки (кол-во слов)']
        for col in int_cols:
            group[col] = group[col].astype(int)
        group.reset_index(drop = True, inplace = True)
        s = group.style.set_properties(**{'text-align': 'right'})
        group.fillna('', inplace = True)
        s.hide_index().render()

        #Training stat
        group_train =  data.groupby(['author'], as_index=False).agg({'idauthor' : ['count'],
                                                                'sentences_text' : ['sum'],
                                                                'words_text' : ['sum'],
                                                                'sentence_mean': ['mean'],
                                                                'name': [lambda col: '<br />'.join(col)],
                                                                    })
        group_train.reset_index(drop = True, inplace = True)
        s_train = group_train.style.set_properties(**{'text-align': 'right'})
        group_train.fillna('', inplace=True)
        group_train.columns = ['Писатель',
                               'Кол-во текстов',
                               'Кол-во предложений',
                               'Кол-во слов',
                               'Средняя длина предложения',
                               'Произведения'
                         ]
        n_train = group_train['Кол-во текстов'].sum()
        s_train.hide_index().render()

        # Testing stat
        group_test = test.groupby(['author'], as_index=False).agg({'idauthor': ['count'],
                                                                    'sentences_text': ['sum'],
                                                                    'words_text': ['sum'],
                                                                    'sentence_mean': ['mean'],
                                                                    'name': [lambda col: '<br />'.join(col)],
                                                                    'validation': ['mean'],
                                                                    'shortname_predict': [lambda col: '<br />'.join(col)],
                                                                    })
        group_test.reset_index(drop=True, inplace=True)
        s_test = group_test.style.set_properties(**{'text-align': 'right'})
        group_test.fillna('', inplace=True)
        group_test.columns = ['Писатель',
                               'Кол-во текстов',
                               'Кол-во предложений',
                               'Кол-во слов',
                               'Средняя длина предложения',
                               'Произведения',
                               'Результат проверки',
                               'Определён автор',
                               ]
        n_test = group_test['Кол-во текстов'].sum()
        s_test.hide_index().render()

        template_vars = {"title": "Отчёт",
                         "detection_accuracy": '%s' % (round(n_accuracy,4)*100),
                         "train_texts_pivot_table_style_render" : s.render(),
                         "n_train_texts": round(n_train,0),
                         "train_texts_table_style_render": s_train.render(),
                         "n_test_texts": round(n_test,0),
                         "test_texts_table_style_render": s_test.render()
                         }
        html_out = template.render(template_vars)

        file = codecs.open(env.filename_global_report_html(), "w", "utf-8-sig")
        file.write(html_out)
        file.close()
        #print(html_out)
        return html_out
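
A usage sketch, as invoked in the main() example below (the import path is hypothetical):

    from mlivos_reporter import Reporter  # hypothetical module path

    r = Reporter()
    html = r.make_report()  # renders report_global.tpl.html and writes the HTML report to disk
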
Example #25
 def test(self, n_min=1, n_max=1):
     t_start = timer()
     env = Environment()
     df_test = pd.DataFrame()
     for i in range(n_min, n_max + 1):
         try:
             dffile = pd.read_csv(env.filename_corpus_csv(i),
                                  index_col='idcorpus',
                                  encoding='utf-8')
         except Exception:
             env.debug(1, [
                 'POStagger', 'test', 'Failed to read corpus file:',
                 env.filename_corpus_csv(i)
             ])
         else:
             env.debug(1, [
                 'POStagger', 'test', 'Read OK:',
                 env.filename_corpus_csv(i)
             ])
             if not dffile.empty:
                 df_test = df_test.append(dffile)
     df_test = df_test.drop_duplicates()
     df_test.columns = ['word', 'gram', 'idgram']
     df_test = df_test.reset_index(drop=True)
     df_test.index.name = 'idcorpus'
     df_test['gram_valid'] = df_test['gram']
     n_testsize = df_test.shape[0]
     env.debug(1, ['POStagger', 'test', 'START %s words' % n_testsize])
     df_test = self.pos(df_test)
     print('Test result', df_test)
     df_err = df_test[df_test['gram_valid'] != df_test['gram']]
     print('Test errors:', df_err)
     df_err.to_csv(env.filename_test_err_csv(), encoding='utf-8')
     env.debug(1, [
         'POStagger', 'test',
         'test accuracy %s' % (1 - df_err.shape[0] / n_testsize)
     ])
     t_end = timer()
     env.debug(1, [
         'POSTagger', 'test', 'test time:',
         env.job_time(t_start, t_end), 'sec.'
     ])
Example #26
 def corpus_xml2csv(self, num=1, persistent=True):
     env = Environment()
     file_xml = env.filename_corpus_xml(num)
     df_xml = pd.DataFrame()
     df_gram = self.grammemes()
     dgram = df_gram.to_dict().get('name')
     try:
         tree = ET.ElementTree(file=file_xml)
     except Exception:
         env.debug(1, ['Failed to load XML:', file_xml])
     else:
         t_start = timer()
         env.debug(1, ['CORPUS', 'XML to CSV:', file_xml])
         for elem in tree.iter('token'):
             #print(elem.tag, elem.attrib)
             serie = pd.Series(data=[])
             badd = False
             s_text = elem.attrib.get('text')
             serie[len(serie)] = s_text.lower()
             for elem2 in elem.iter('g'):
                 #print(elem2.tag, elem2.attrib)
                 sgram = elem2.attrib.get('v')
                 sgram = sgram.upper()
                 if df_gram['name'].isin([sgram]).any():
                     serie[len(serie)] = sgram
                     serie[len(serie)] = int(df_gram.index[
                         df_gram['name'] == sgram].tolist()[0])
                     #serie[len(serie)] = list(dgram.keys())[list(dgram.values()).index(sgram)]
                     badd = True
                 break
             #print(s)
             if badd:
                 df_xml = df_xml.append(serie, ignore_index=True)
         if not df_xml.empty:
             df_xml = df_xml.drop_duplicates()
             df_xml = df_xml.reset_index(drop=True)
             df_xml.index.name = 'idcorpus'
             df_xml.columns = ['word', 'gram', 'idgram']
             df_xml = df_xml.astype({"idgram": int})
             if persistent:
                 file_csv = env.filename_corpus_csv(num)
                 env.debug(1, ['Write corpus file to CSV:', file_csv])
                 df_xml.to_csv(file_csv, encoding='utf-8')
                 t_end = timer()
                 env.debug(1, [
                     'CORPUS', 'CSV written:', file_csv,
                     'takes %s sec.' % env.job_time(t_start, t_end)
                 ])
     return df_xml
Example #27
    def train(self,
              df=pd.DataFrame(),
              validation='eval',
              n_splits=5,
              b_smoketest=True,
              n_frac=1):
        env = Environment()
        enc = Word_Encoder()
        df_train = df
        bgm_columns = env.bgm_columns_list(mode=1)
        drop_columns = [
            'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
            'n_token'
        ]  #, 'bgm_l_None'
        #drop_columns.extend(['bgm_l_%s' % (i) for i in range(1, env.bgm_columns_max()) if 'bgm_l_%s' % (i) not in bgm_columns])
        env.debug(1,
                  ['POStagger', 'train',
                   'Drop colums: %s' % (drop_columns)])

        if df_train.empty:
            t_start = timer()
            df_train = self.tokenz()
            t_end = timer()
            env.debug(1, [
                'POSTagger', 'train', 'tokenz loaded:', 'time:',
                env.job_time(t_start, t_end)
            ])

        env.debug(1, [
            'POStagger', 'train',
            'All tokenz set shape %s' % df_train.shape[0]
        ])
        t_start = timer()
        env.debug(1, ['POStagger', 'train', 'Learning: START'])
        if n_frac < 1:
            df_train = df_train.sample(frac=n_frac)
            env.debug(1, [
                'POStagger', 'train',
                'Training tokenz set shape %s' % df_train.shape[0]
            ])
            #print(df_train.shape)

        #df_train2 = df_train[bgm_columns]
        #print(df_train2.shape)
        #df_train2 = df_train2.astype({"idgram": int})
        df_train = df_train.drop(columns=drop_columns, axis=1)
        env.debug(
            1, ['POStagger',
                'Train colums: %s' % (df_train.columns.tolist())])
        #print(df_train.columns)

        #df_train = df_train.drop_duplicates() #slow-slow
        #print(df_train.head())

        df_train = df_train.fillna(0)
        file_x = env.filename_xtrain_csv()
        df_train.to_csv(file_x, encoding='utf-8')
        env.debug(1, ['POStagger', 'train', 'Save X', file_x])
        y = df_train['idgram'].values
        df_train.drop(columns=['idgram'], inplace=True)
        X = df_train.values
        #array = df_train.values
        #print(df_train)
        #X = array[:, 1:]
        #Y = array[:, 0]

        #print(X, Y)
        #validation_size = 0.20
        seed = 241
        frac_test_size = 0.2

        sc = StandardScaler()
        #Y_sc = sc.fit_transform(Y)
        t2_start = timer()
        if validation == 'cv':  #Need cross-validation
            scoring = 'accuracy'
            # scoring = 'f1_samples'
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
            if True:  #Decision tree
                env.debug(1, ['Tree cross-validation'])
                # clf = DecisionTreeClassifier(criterion='gini', random_state=seed)  # 0.79
                # clf = KNeighborsClassifier(n_neighbors=230)
                model = DecisionTreeClassifier(criterion='entropy',
                                               random_state=seed)  # 0.81
                env.debug(
                    1, ['Calculate cross_val_score. Splits=%s' % (n_splits)])
                scores = cross_val_score(model, X, y, cv=kf)
                print('DTree scores:', scores.mean(), 'raw', scores)

            if False:  #Logistic regression
                env.debug(1, ['LGR cross-validation'])
                n_Cs = [0.01]
                X = array[:, 5:]
                X_sc = sc.fit_transform(X)
                Y = df_train['idgram'].values
                Y[Y > 0] = 1
                print(X_sc, Y)
                for n_c in n_Cs:
                    #clf = LogisticRegression(penalty='l2', solver='saga', C=n_c, multi_class='multinomial')
                    clf = LogisticRegression(penalty='l2',
                                             solver='liblinear',
                                             C=n_c)
                    # clf = SVC(kernel='linear', C=10000, random_state=241)
                    # clf = SVC(kernel='linear', C=0.01, random_state=seed)
                    # clf = SVC(random_state=seed)
                    # clf = Perceptron()
                    env.debug(1, [
                        'Calculate cross_val_score. Splits=%s C=%s' %
                        (n_splits, n_c)
                    ])
                    scores = cross_val_score(clf, X_sc, Y, cv=kf)
                    print(scores)

            if False:  #GBM, RandomForest
                env.debug(1, ['GBM cross-validation'])
                asteps = [20]  #GBM
                #asteps=[100] #RandomForest
                for i in asteps:
                    #clf = RandomForestClassifier(n_estimators=i)
                    clf = GradientBoostingClassifier(
                        n_estimators=i, max_depth=8)  #, max_features='sqrt'
                    env.debug(1, [
                        'Calculate cross_val_score. Splits=%s Estimators=%s' %
                        (n_splits, i)
                    ])
                    scores = cross_val_score(clf, X, Y, cv=kf)
                    print(scores)

        if validation == 'eval':
            # eval
            model = xgb.XGBClassifier(n_estimators=140,
                                      max_depth=16,
                                      colsample_bytree=1,
                                      subsample=0.5,
                                      seed=seed)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=frac_test_size,
                random_state=seed,
                shuffle=True)
            eval_set = [(X_train, y_train), (X_test, y_test)]
            # print(eval_set)
            f_eval = 'merror'
            # f_eval = 'mlogloss'
            model.fit(X_train,
                      y_train,
                      eval_metric=f_eval,
                      eval_set=eval_set,
                      verbose=False,
                      early_stopping_rounds=20)
            ev_scores = model.evals_result()
            ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
            #print(model.feature_importances_)
            print(ev_mean, ev_scores)
            xgb.plot_importance(model)
            plt.show()
        t2_end = timer()
        t_end = timer()
        env.debug(1, ['CV completed:', 'time:', env.job_time(t_start, t_end)])

        if validation == 'cv':
            #Training on all of the data
            X_train, y_train = X, y

            # model = SVC()
            # model= DecisionTreeClassifier() #79
            # model= LinearDiscriminantAnalysis() #47
            # model=LogisticRegression() #48
            # model = KNeighborsClassifier(n_neighbors=200) #48
            # model = GaussianNB()   #43
            #print('Fit...')

            #print('Validate...')
            # predictions = model.predict(X_validation)

            # print(accuracy_score(Y_validation, predictions))
            # print(confusion_matrix(Y_validation, predictions))
            # print(classification_report(Y_validation, predictions))

            t_start = timer()
            env.debug(1, ['Training: START'])
            model.fit(X_train, y_train)
            t_end = timer()
            env.debug(1, ['Training: END', env.job_time(t_start, t_end)])

        pickle.dump(sc, open(env.filename_scaler(), 'wb'))
        pickle.dump(model, open(env.filename_model_tree(), 'wb'))

        # Smoke test
        if b_smoketest:
            X_smoke_predict = [
                'съеште', 'ещё', 'этих', 'мягких', 'французских', 'булок'
            ]
            a_smoke = np.array(
                [enc.word2token(elem) for elem in X_smoke_predict])
            y_predictions = model.predict(a_smoke[:, 0:])
            y_predictions_proba = model.predict_proba(a_smoke[:, 0:])
            #print(y_predictions)
            print('Prediction', list(zip(X_smoke_predict, y_predictions)))
            print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
        return model
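
A usage sketch matching how the main() example invokes training (the import path is hypothetical):

    from mlivos_postagger import POSTagger  # hypothetical module path

    t = POSTagger()
    model = t.train(validation='eval', n_frac=0.95)  # hold-out evaluation; the fitted model is pickled to disk
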
Example #28
 def dict_xml2csv(self, persistent=True, lines=10000):
     t_start = timer()
     env = Environment()
     dfgram = self.grammemes()
     filename_dict = env.filename_dict_xml()
     dfcols = ['word', 'gram', 'idgram']
     df_xml = pd.DataFrame(columns=dfcols)
     env.debug(
         1, ['CORPUS', 'Start to load dictionary from XML:', filename_dict])
     try:
         fp = io.open(filename_dict, mode="r", encoding="utf-8")
     except Exception:
         env.debug(1, [
             'CORPUS', 'Failed to open dictionary file XML:', filename_dict
         ])
     else:
         number_lines = sum(1 for line in fp)
         fp.seek(0)
         t_end = timer()
         env.debug(1, [
             'CORPUS', 'File opened:', 'lines',
             '%s' % number_lines, 'time:',
             env.job_time(t_start, t_end)
         ])
         t_start = timer()
         step = number_lines // lines
         env.debug(1, [
             'CORPUS', 'Read dictionary:', filename_dict,
             'lines: %s step %s' % (lines, step)
         ])
         n_line = 0
         for i in range(0, number_lines):
             line = fp.readline()
             #print(line[5:10])
             if (line[5:10] == 'lemma') and (n_line == 0):
                 #print(line)
                 tree = ET.fromstring(line)
                 for elem in tree.iter('l'):
                     s_word = elem.attrib.get('t')
                     gram = ['', 0]
                     j = 0
                     for elem2 in elem.iter('g'):
                         gram[j] = elem2.attrib.get('v')
                         break
                     gram[1] = int(dfgram.index[dfgram['name'] ==
                                                gram[0]].tolist()[0])
                 #print(s_word,gram)
                 s = pd.Series(data=[s_word, gram[0], gram[1]],
                               index=dfcols)
                 df_xml = df_xml.append(s, ignore_index=True)
                 n_line += 1
             n_line += 1
             if n_line >= step:
                 n_line = 0
         fp.close()
         df_xml.index.name = 'idcorpus'
         t_end = timer()
         env.debug(1, [
             'CORPUS', 'Dictionary loaded:', 'time:',
             env.job_time(t_start, t_end)
         ])
         if persistent:
             filename_csv = env.filename_dict_csv()
             env.debug(1,
                       ['CORPUS', 'Write dictionary to CSV:', filename_csv])
             df_xml.to_csv(filename_csv, encoding='utf-8')
             env.debug(1, ['CORPUS', 'Dictionary saved:', filename_csv])
     return df_xml
Example #29
def main():
    pd.set_option("display.max_columns", 100)
    pd.set_option('display.width', 1000)

    env = Environment()
    c = OpenCorpus()
    t = POSTagger()
    a = mlAnalyzer()
    enc = Word_Encoder()
    r = Reporter()
    #c.dict_xml2csv(lines = 600000)
    #c.grammemes_xml2csv()
    #c.vocabulary_from_corpus(1,1000)
    g = pd.DataFrame()
    g = c.grammemes()
    #dg = g.to_dict().get('name')
    dg = c.grammemes(mode=1)  #grammemes by id
    da = c.authors(mode=1)  # authors by id

    #print(dg)
    #print(p.head())
    #for i in range(2015,3000):
    #    c.corpus_xml2csv(i)
    #c.corpus_xml2csv(2)
    #for i in range (125,150):
    #    c.corpus_xml2txt(i)
    #print(c.vocabulary_from_corpus(1,2000).head())
    #voc=c.vocabulary()
    #print(voc.head())
    #t.tokenize()
    #print(t.tokenize(voc, n_frac=1))
    #t.tokenz_create_stat()
    #print(env.bgm_stat())
    #print(t.tokenz())
    #print(c.vocabulary())
    #print(enc.word2token('паровоз'))
    #print(enc.word2token('аз'))
    #t.train(n_frac=0.8, validation='cv')
    #t.train(n_frac=0.95, validation='eval')
    #t.test(2000,2048)
    #a.process_from_texts_file([49], mode='chunk_size')
    #a.process_from_texts_file([58], max_words = 8000)
    #arrt = [2, 45, 43, 44, 42, 40, 41, 46, 36, 37, 38, 34]
    #arrt = [69]
    #for i in range (51,95):
    #for i in arrt:
    #a.process_from_texts_file([i], max_words = 8000)
    #t.vizualize2d(n_frac=0.01)
    #nltk.download()
    #a.vizualize2d()
    #a.vizualize2d(mode='train')
    #a.vizualize2d(mode='test')
    #a.model_train()
    #return 0
    #y = a.predict([0, 1, 2, 3, 4])
    #y = a.predict([0, 1, 2, 3, 4])
    print(a.predict([16], b_makestat=True))
    a.vizualize2d(mode='test')
    #for i in y:
    #    print('idtext=%s' % i, da.get(i))
    #text2predict = [11, 12, 13, 14, 15]
    #y = a.predict(text2predict, b_makestat=True)  # predict - pass the text number
    #j = 0
    #for i in y:
    #    print('idtext=%s' % text2predict[j], 'Автор=%s (%s)' % (i, da.get(i)))
    #    j = j + 1

    #predict=(t.pos_word_by_voc(['съеште', 'школа','господина','приехал',
    #                       'глокая','куздра','штеко','будланула','бокра','и','кудрячит','бокрёнка']))

    X_predict = [
        'съеште', 'школа', 'господина', 'приехал', 'глокая', 'куздра', 'штеко',
        'будланула', 'бокра', 'и', 'кудрячит', 'бокрёнка', 'он', 'видел', 'их',
        'семью', 'своими', 'глазами'
    ]
    #X_predict=['символ']
    y_predict = t.pos_word_by_ml(X_predict)

    print([
        '%s/%s' % (X_predict[i], dg.get(y_predict[i]))
        for i in range(0, len(y_predict))
    ])
    r.make_report()
Example #30
    def pos(self, df, mode_fast=True, use_cache=True):
        env = Environment()
        enc = Word_Encoder()
        df_res = df
        t_start = timer()

        c = OpenCorpus()
        g = c.grammemes()
        dg = g.to_dict().get('name')

        # Cache file with previous ML predictions: columns (word, gram_ml, count)
        cache_columns = ['word', 'gram_ml', 'count']
        file_cache = env.filename_mlcache_csv()
        try:
            df_cache = pd.read_csv(file_cache,
                                   index_col='idcorpus',
                                   encoding='utf-8')
        except:
            env.debug(
                1,
                ['POSTagger', 'pos', 'Failed to read cache file:', file_cache])
            df_cache = pd.DataFrame(columns=cache_columns)
        else:
            env.debug(1, ['POSTagger', 'pos', 'Read ML cache OK:', file_cache])

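        # Seed the feature matrix with one dummy encoded token so np.append below always has
        # a row to attach to; the placeholder row is dropped again via a_predict[1:, :].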
        a_predict = np.array([enc.word2token('')])
        #a_words = ['']
        n_words = df_res.shape[0]

        env.debug(1, [
            'POStagger', 'pos',
            'START Vocabulary prediction %s words' % n_words
        ])
        a_words = df_res['word'].tolist()
        a_ml_words = []
        predictions_voc = self.pos_by_voc(a_words)
        p_se = pd.Series(predictions_voc)
        df_res['gram'] = p_se.values
        df_res['gram_voc'] = p_se.values
        df_res['gram_ml'] = ''
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'END Vocabulary prediction %s sec.' % env.job_time(t_start, t_end)
        ])
        #print(predictions_voc)

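        # Fast mode: only words the vocabulary could not tag are encoded and sent to the ML
        # model; otherwise every word is re-encoded and predicted.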
        if mode_fast:
            #env.debug(1, ['POStagger', 'pos', 'START Fast mode vocabulary search. Words %s' % df.shape[0]])
            df_ni_voc = df_res[df_res['gram_voc'] == '']
            n_words = df_ni_voc.shape[0]
        else:
            df_ni_voc = df_res
        #print('non-vocabulary',df_ni_voc)
        if not df_ni_voc.empty:
            env.debug(
                1, ['POStagger', 'pos',
                    'START Encoding %s words' % n_words])
            for index, serie in df_ni_voc.iterrows():
                word = df_ni_voc.at[index, 'word']
                #print(word)
                a_padd = np.array([enc.word2token(word)])
                a_predict = np.append(a_predict, a_padd, axis=0)
                a_ml_words.append(word)
                #print(a_words, a_predict)
            a_predict = a_predict[1:, :]
            #print(a_predict)
            #print('ml_words',a_ml_words)
            t_end = timer()
            env.debug(1, [
                'POStagger', 'pos',
                'END Encoding %s words %s sec.' %
                (n_words, env.job_time(t_start, t_end))
            ])

        t_start = timer()
        env.debug(1, ['POStagger', 'pos', 'START Model prediction'])
        clf = pickle.load(open(env.filename_model_tree(), 'rb'))
        predictions_ml = clf.predict(a_predict[:, 0:])
        # print('ml', predictions_ml)
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'END Model prediction %s sec.' % env.job_time(t_start, t_end)
        ])
        #print('ml_words_prediction',list(zip(a_ml_words,predictions_ml)))

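        # Merge the two prediction sources: keep the vocabulary tag where one exists,
        # otherwise fall back to the ML prediction for that word.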
        t_start = timer()
        i = 0
        for index, row in df_res.iterrows():
            s_pml = ''  # reset each pass so a failed lookup cannot reuse the previous word's tag
            word = df_res.at[index, 'word']
            s_pvoc = df_res.at[index, 'gram_voc']
            #s_pvoc = predictions_voc[i]
            #print('s_pvoc', word, s_pvoc)
            #df_res.at[index, 'gram_voc'] = s_pvoc
            if s_pvoc == '':
                if mode_fast:
                    try:
                        j = a_ml_words.index(word)
                    except:
                        pass
                    else:
                        s_pml = dg.get(predictions_ml[j])
                        #print(word,s_pml)
                else:
                    s_pml = dg.get(predictions_ml[i])
                df_res.at[index, 'gram_ml'] = s_pml
                df_res.at[index, 'gram'] = s_pml
            i = i + 1
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'ML predictions dataframe filled %s sec' %
            env.job_time(t_start, t_end)
        ])
        #print(df_res)
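        # Append the freshly ML-tagged words to the cache and aggregate duplicates by summing counts.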
        df_cache = pd.concat([
            df_cache,
            df_res[df_res.gram_ml != ''][['word', 'gram_ml', 'count']]
        ])
        df_cache = df_cache.groupby(['word',
                                     'gram_ml']).agg({'count': ['sum']})
        df_cache.reset_index(inplace=True)
        df_cache.index.name = 'idcorpus'
        df_cache.columns = cache_columns
        df_cache.sort_values(by=['count'], inplace=True, ascending=False)
        #print(df_cache)
        env.debug(1,
                  ['POStagger', 'pos', 'Write ML cache to CSV:', file_cache])
        df_cache.to_csv(file_cache, encoding='utf-8')
        return df_res
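
A hedged usage sketch for pos(): the input DataFrame is assumed to need a 'word' column (read inside the loops) and a 'count' column (used by the cache aggregation at the end); the sample words are placeholders:

tagger = POSTagger()
df_words = pd.DataFrame({'word': ['паровоз', 'школа', 'бокрёнка'],
                         'count': [1, 1, 1]})
df_tagged = tagger.pos(df_words, mode_fast=True)
print(df_tagged[['word', 'gram', 'gram_voc', 'gram_ml']])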
Пример #31
0
    def tokenize(self, dftokenz=pd.DataFrame(), persistent=True, n_frac=1):
        env = Environment()
        enc = Word_Encoder()
        t_start = timer()
        if dftokenz.empty:
            dftokenz = self.tokenz()
        if n_frac < 1:
            dftokenz = dftokenz.sample(frac=n_frac)
        env.debug(
            1, ['Transforming to tokenz: START %s words' % dftokenz.shape[0]])

        gmask = dftokenz.groupby(['gram'])
        df_posstat = gmask.count()
        df_posstat.to_csv(env.filename_stat_pos_tokenz_csv(), encoding='utf-8')
        print('POSTagger', 'train dataset stat:\n', gmask.count())

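        # Scalar feature columns filled from the Word_Encoder output below;
        # string columns ('s_*') default to '', numeric columns to 0.0.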
        fields = [
            's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3', 'n_token',
            'n_len', 'n_tokens2', 'n_tokens3', 'n_tokenp2', 'n_tokenp3'
        ]

        for field in fields:
            val = 0.0
            if field[0] == 's':
                val = ''
            dftokenz[field] = val

        n_letters = 0
        s_letters = env.list_rus_letters()
        di_letters = env.di_bgm_byletters
        #bgm_columns_i = env.bgm_columns_list(mode=0)
        bgm_columns = env.bgm_columns_list(mode=1)

        #print('bgm_columns', bgm_columns)
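        # One indicator column per letter bigram; a cell is set to 1 further down
        # when the corresponding bigram occurs in the word.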
        for column_name in bgm_columns:
            dftokenz[column_name] = None

        t_end = timer()
        env.debug(1, [
            'POStagger', 'Letters bigram columns added',
            env.job_time(t_start, t_end)
        ])

        # Form tokenz: fill the scalar feature columns and bigram indicators row by row
        t_start = timer()
        for index, serie in dftokenz.iterrows():
            # print (serie.values)
            a_word = enc.s2token(index, serie)
            i = 2
            # print(a_word)
            for field in fields:
                dftokenz.at[index, field] = a_word[i]
                # print(field, a_word[i])
                i = i + 1
            # print(dftokenz.loc[index])
            #Letters bigram binaries
            for n_l in range(0, len(a_word[0]) - 1):
                n_l2 = n_l + 1
                di_n = di_letters.get('%s%s' %
                                      (a_word[0][n_l], a_word[0][n_l2]))
                if di_n is not None:
                    #print(di_n)
                    #print(bgm_columns[di_n])
                    dftokenz.at[index, bgm_columns[di_n]] = 1
        t_end = timer()
        env.debug(
            1,
            ['Transforming to tokenz: COMPLETE',
             env.job_time(t_start, t_end)])
        if persistent:
            dftokenz.to_csv(env.filename_tokenz_csv(), encoding='utf-8')
            env.debug(1, ['Tokenz written to CSV:', env.filename_tokenz_csv()])
        return dftokenz
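
A usage sketch for tokenize(), assuming the stored token table loaded by self.tokenz() is available; n_frac samples a fraction of it to keep the run short:

tagger = POSTagger()
# Build features for a 10% sample of the token table and write them to CSV
df_features = tagger.tokenize(n_frac=0.1, persistent=True)
print(df_features.shape)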