def stat(self):
    env = Environment()
    data = pd.DataFrame()
    file_stat = env.filename_results_csv()
    try:
        data = pd.read_csv(file_stat, index_col='idstat', encoding='utf-8')
    except Exception:
        env.debug(1, ['Failed to read stat file:', file_stat])
    else:
        env.debug(1, ['Read stat file OK:', file_stat])
    return data
def tokenz(self):
    env = Environment()
    df_tokenz = pd.DataFrame()
    file_tokenz = env.filename_tokenz_csv()
    try:
        df_tokenz = pd.read_csv(file_tokenz, index_col='idcorpus', encoding='utf-8')
    except Exception:
        env.debug(1, ['Failed to read tokenz file:', file_tokenz])
    else:
        env.debug(1, ['Read tokenz OK:', file_tokenz])
    return df_tokenz
def authors(self, mode=0):
    env = Environment()
    df = pd.DataFrame()
    filename = env.filename_authors_csv()
    try:
        df = pd.read_csv(filename, index_col='idauthor', encoding='utf-8')
    except Exception:
        env.debug(1, ['Failed to load authors CSV file', filename])
    else:
        env.debug(1, ['Load authors CSV file', filename])
    if mode == 1:
        return df.to_dict().get('name')
    else:
        return df
def grammemes(self, mode=0):
    env = Environment()
    dfgram = pd.DataFrame()
    filename_gram = env.filename_grammemes_csv()
    try:
        dfgram = pd.read_csv(filename_gram, index_col='idgram', encoding='utf-8')
    except Exception:
        env.debug(1, ['Failed to load grammemes CSV file', filename_gram])
    else:
        env.debug(1, ['Load grammemes CSV file', filename_gram])
    if mode == 1:
        return dfgram.to_dict().get('name')
    else:
        return dfgram
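# Usage sketch for the loaders above (assumption: these methods live on the
# OpenCorpus class and the CSV paths configured in Environment exist).
# With mode=1 the loaders return a plain {id: name} dict instead of a DataFrame:
#
#     corpus = OpenCorpus()
#     di_gram = corpus.grammemes(mode=1)   # e.g. {0: 'NOUN', 1: 'VERB', ...}
#     di_auth = corpus.authors(mode=1)     # {idauthor: name}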
def corpus_xml2txt(self, num=1, persistent=True):
    result = True
    env = Environment()
    file_xml = env.filename_corpus_xml(num)
    try:
        tree = ET.ElementTree(file=file_xml)
    except Exception:
        env.debug(1, ['Failed to load XML:', file_xml])
        result = False
    else:
        file_txt = env.filename_corpus_txt(num)
        with open(file_txt, mode='w', encoding='utf-8') as file:
            for elem in tree.iter('source'):
                file.write(elem.text)
                file.write(' ')
        env.debug(1, ['Write corpus file to TXT:', file_txt])
    return result
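# Usage sketch (assumes corpus file no. 1 exists at the path returned by
# Environment.filename_corpus_xml(1)):
#
#     if OpenCorpus().corpus_xml2txt(num=1):
#         print('Converted to', Environment().filename_corpus_txt(1))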
def test(self, n_min=1, n_max=1):
    t_start = timer()
    env = Environment()
    df_test = pd.DataFrame()
    for i in range(n_min, n_max + 1):
        try:
            dffile = pd.read_csv(env.filename_corpus_csv(i), index_col='idcorpus', encoding='utf-8')
        except Exception:
            env.debug(1, ['POStagger', 'test', 'Failed to read corpus file:', env.filename_corpus_csv(i)])
        else:
            env.debug(1, ['POStagger', 'test', 'Read OK:', env.filename_corpus_csv(i)])
            if not dffile.empty:
                df_test = df_test.append(dffile)
    df_test = df_test.drop_duplicates()
    df_test.columns = ['word', 'gram', 'idgram']
    df_test = df_test.reset_index(drop=True)
    df_test.index.name = 'idcorpus'
    df_test['gram_valid'] = df_test['gram']
    n_testsize = df_test.shape[0]
    env.debug(1, ['POStagger', 'test', 'START %s words' % n_testsize])
    df_test = self.pos(df_test)
    print('Test result', df_test)
    df_err = df_test[df_test['gram_valid'] != df_test['gram']]
    print('Test errors:', df_err)
    df_err.to_csv(env.filename_test_err_csv(), encoding='utf-8')
    env.debug(1, ['POStagger', 'test', 'test accuracy %s' % (1 - df_err.shape[0] / n_testsize)])
    t_end = timer()
    env.debug(1, ['POSTagger', 'test', 'test time:', env.job_time(t_start, t_end), 'sec.'])
def vocabulary_from_corpus(self, n_min=1, n_max=10, persistent=True):
    env = Environment()
    df_voc = pd.DataFrame()
    for i in range(n_min, n_max + 1):
        file_csv = env.filename_corpus_csv(i)
        try:
            dffile = pd.read_csv(file_csv, index_col='idcorpus', encoding='utf-8')
        except Exception:
            env.debug(1, ['Failed to read corpus file:', file_csv])
        else:
            env.debug(1, ['Read OK:', file_csv])
            if not dffile.empty:
                df_voc = df_voc.append(dffile)
    df_voc = df_voc.drop_duplicates()
    df_voc.columns = ['word', 'gram', 'idgram']
    df_voc = df_voc.reset_index(drop=True)
    df_voc.index.name = 'idcorpus'
    if persistent:
        file_voc = env.filename_vocabulary_csv()
        env.debug(1, ['Write vocabulary to CSV:', file_voc])
        df_voc.to_csv(file_voc, encoding='utf-8')
    return df_voc
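# Usage sketch (assumes corpus CSV files 1..10 were produced by corpus_xml2csv):
#
#     df_voc = OpenCorpus().vocabulary_from_corpus(n_min=1, n_max=10)
#     print(df_voc.head())   # columns: word, gram, idgram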
def corpus_xml2csv(self, num=1, persistent=True):
    env = Environment()
    file_xml = env.filename_corpus_xml(num)
    df_xml = pd.DataFrame()
    df_gram = self.grammemes()
    try:
        tree = ET.ElementTree(file=file_xml)
    except Exception:
        env.debug(1, ['Failed to load XML:', file_xml])
    else:
        t_start = timer()
        env.debug(1, ['CORPUS', 'XML to CSV:', file_xml])
        for elem in tree.iter('token'):
            serie = pd.Series(data=[])
            badd = False
            s_text = elem.attrib.get('text')
            serie[len(serie)] = s_text.lower()
            for elem2 in elem.iter('g'):
                sgram = elem2.attrib.get('v').upper()
                if df_gram[df_gram['name'] == sgram].size > 0:  # keep only known grammemes
                    serie[len(serie)] = sgram
                    serie[len(serie)] = int(df_gram.index[df_gram['name'] == sgram].tolist()[0])
                    badd = True
                    break
            if badd:
                df_xml = df_xml.append(serie, ignore_index=True)
        if not df_xml.empty:
            df_xml = df_xml.drop_duplicates()
            df_xml = df_xml.reset_index(drop=True)
            df_xml.index.name = 'idcorpus'
            df_xml.columns = ['word', 'gram', 'idgram']
            df_xml = df_xml.astype({"idgram": int})
            if persistent:
                file_csv = env.filename_corpus_csv(num)
                env.debug(1, ['Write corpus file to CSV:', file_csv])
                df_xml.to_csv(file_csv, encoding='utf-8')
                t_end = timer()
                env.debug(1, ['CORPUS', 'CSV written:', file_csv, 'takes %s sec.' % env.job_time(t_start, t_end)])
    return df_xml
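# The resulting corpus CSV holds one row per annotated token, e.g. (illustrative
# values; actual idgram ids come from the grammemes CSV):
#
#     idcorpus,word,gram,idgram
#     0,мама,NOUN,0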
def grammemes_xml2csv(self, persistent=True):
    env = Environment()
    filename_gram = env.filename_grammemes_xml()
    dfcols = ['name', 'alias', 'description']
    df_xml = pd.DataFrame(columns=dfcols)
    try:
        tree = ET.ElementTree(file=filename_gram)
    except Exception:
        env.debug(1, ['Failed to load grammemes from XML:', filename_gram])
    else:
        env.debug(1, ['Read grammemes:', filename_gram])
        for elem in tree.iter('grammeme'):
            sattr = elem.attrib.get('include')
            if sattr == 'on':  # only grammemes explicitly marked for inclusion
                sname = sali = sdesc = ''
                for child in elem:
                    if child.tag.lower() == 'name':
                        sname = child.text.upper()
                    elif child.tag.lower() == 'alias':
                        sali = child.text.upper()
                    elif child.tag.lower() == 'description':
                        sdesc = child.text.lower()
                s = pd.Series(data=[sname, sali, sdesc], index=dfcols)
                df_xml = df_xml.append(s, ignore_index=True)
        df_xml.index.name = 'idgram'
        if persistent:
            filename_csv = env.filename_grammemes_csv()
            env.debug(1, ['Write grammemes to CSV:', filename_csv])
            df_xml.to_csv(filename_csv, encoding='utf-8')
    return df_xml
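# Usage sketch (assumes the grammemes XML is at the path configured in
# Environment.filename_grammemes_xml()):
#
#     df_gram = OpenCorpus().grammemes_xml2csv()
#     print(df_gram.head())  # columns: name, alias, description; index: idgram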
def tokenz_create_stat(self, dftokenz=pd.DataFrame(), n_frac=1):
    env = Environment()
    di_letters = Environment.di_bgm_byletters
    bgm_columns = env.bgm_columns_list(mode=1)
    t_start = timer()
    if dftokenz.empty:
        dftokenz = self.tokenz()
    if n_frac < 1:
        dftokenz = dftokenz.sample(frac=n_frac)
    env.debug(1, ['POStagger', 'create_stat', 'Collecting statistic START %s words' % dftokenz.shape[0]])
    di_tokenz_stat = (dftokenz.count()).to_dict()
    print('di_tokenz_stat', di_tokenz_stat)
    bgm_astat = [['init', 0]]
    bgm_index = []
    for key in di_letters:
        di_n = di_letters.get(key)
        column_stat = di_tokenz_stat.get(bgm_columns[di_n])
        bgm_astat.append([key, column_stat])
        bgm_index.append(di_n)
    bgm_astat = bgm_astat[1:]  # drop the 'init' placeholder
    print('column stat', bgm_astat)
    df_bgm_stat = pd.DataFrame(data=bgm_astat, columns=['bigram', 'counts'], index=bgm_index)
    df_bgm_stat.index.name = 'idbigram'
    df_bgm_stat = df_bgm_stat.sort_values(by=['counts'], ascending=False)
    print('bgm_stat\n', df_bgm_stat)
    df_bgm_stat.to_csv(env.filename_stat_bigram_letters_csv(), encoding='utf-8')
def vocabulary(self):
    env = Environment()
    file_voc = env.filename_vocabulary_csv()  # from vocabulary file
    file_dict = env.filename_dict_csv()  # from dictionary file
    try:
        df_voc = pd.read_csv(file_voc, index_col='idcorpus', encoding='utf-8')
    except Exception:
        env.debug(1, ['Failed to read vocabulary file:', file_voc])
    else:
        env.debug(1, ['Read vocabulary OK:', file_voc])
    try:
        df_dict = pd.read_csv(file_dict, index_col='idcorpus', encoding='utf-8')
    except Exception:
        env.debug(1, ['Failed to read dictionary file:', file_dict])
    else:
        env.debug(1, ['Read dictionary OK:', file_dict])
    # Concat vocabulary and dictionary
    df_res = pd.concat([df_voc, df_dict])
    df_res = df_res.drop_duplicates()
    # Apply patch words: patch rows replace rows for the same words
    df_patch = pd.read_csv(env.filename_vocabulary_patch_csv(), index_col='idcorpus', encoding='utf-8')
    df_res = df_res.drop(df_res[df_res['word'].isin(df_patch['word'])].index, axis=0)
    df_res = pd.concat([df_res, df_patch])
    df_res = df_res.reset_index(drop=True)
    df_res.index.name = 'idcorpus'
    return df_res
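# Usage sketch (assumes vocabulary, dictionary, and patch CSV files exist at
# the configured paths): the merged frame prefers patch entries over both
# the corpus vocabulary and the dictionary for the same words.
#
#     df = OpenCorpus().vocabulary()
#     print(df.shape, df.columns.tolist())   # ('word', 'gram', 'idgram')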
def process_from_texts_file(self, aidtext, mode='process', max_words=0):
    env = Environment()
    file_res = env.filename_results_csv()
    dfres = pd.read_csv(file_res, index_col='idstat', encoding='utf-8')  # file for the statistic results
    df_texts = pd.read_csv(env.filename_texts_csv(), index_col='idtext', encoding='utf-8')  # register of texts
    mask = df_texts.index.isin(aidtext)
    df_texts = df_texts[mask]
    for index, row in df_texts.iterrows():  # for each text that should be processed
        file_txt = df_texts.at[index, 'filename']
        # Read text file
        env.debug(1, ['START file TXT:', file_txt])
        t_start = timer()
        file = codecs.open(file_txt, "r", "utf_8_sig")
        text = file.read().strip()
        file.close()
        # The author is specified in the training set
        idauthor = df_texts.at[index, 'idauthor']  # author
        name = df_texts.at[index, 'name']  # title
        columns = dfres.columns
        if mode == 'process':  # collect statistics for the text and write them to results
            # The actual text processing
            df_add = self.analyze_text(columns, text, index, idauthor, name, file_txt, max_words)  # analyze text, get Series
            df_add.reset_index(drop=True, inplace=True)
            dfres = dfres.append(df_add, ignore_index=True)  # append to the results
            dfres.reset_index(drop=True, inplace=True)
            dfres.index.name = 'idstat'
        if mode == 'chunk_size':  # only determine the chunk size
            n_chunk_size = self.validate_chunk_size(columns, text, index, idauthor, name, file_txt)
        t_end = timer()
        env.debug(1, ['END file TXT:', file_txt, 'time:', env.job_time(t_start, t_end)])
    # Save the result to disk
    if mode == 'process':
        int_cols = ['idtext', 'idchunk', 'idauthor', 'sentences_text', 'words_text',
                    'sentences_chunk', 'words_chunk', 'words_uniq_chunk']
        for col in int_cols:
            dfres[col] = dfres[col].astype(int)
        dfres.to_csv(file_res, encoding='utf-8')
def predict(self, aidtext, b_makestat=False):
    env = Environment()
    # Open the statistics file for the test texts
    df_stat = pd.read_csv(env.filename_stat_test_csv(), index_col='idstat', encoding='utf-8')
    df_texts = pd.read_csv(env.filename_predict_csv(), index_col='idtext', encoding='utf-8')  # register of texts
    mask = df_texts.index.isin(aidtext)
    df_texts = df_texts[mask]
    columns = ['idtext', 'idchunk', 'idauthor', 'author', 'name', 'file',
               'sentences_text', 'words_text', 'sentence_mean',
               'sentences_chunk', 'words_chunk', 'words_uniq_chunk', 'uniq_per_sent_chunk', 'uniq_per_words_chunk',
               'NOUN', 'ADJF', 'ADJS', 'COMP', 'VERB', 'INFN', 'PRTF', 'PRTS', 'GRND', 'NUMR',
               'ADVB', 'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'predict']
    # Prepare the statistics for the test texts if requested
    if b_makestat:
        for index, row in df_texts.iterrows():  # for each text that should be processed
            file_txt = df_texts.at[index, 'filename']
            # Read text file
            env.debug(1, ['Analyzer', 'predict', 'START file TXT:', file_txt])
            t_start = timer()
            file = codecs.open(file_txt, "r", "utf_8_sig")
            text = file.read().strip()
            file.close()
            # Strictly speaking, the author is unknown for the test set
            idauthor = df_texts.at[index, 'idauthor']  # author
            name = df_texts.at[index, 'name']  # title
            # The actual text processing
            df_add = self.analyze_text(columns, text, index, idauthor, name, file_txt)  # analyze text, get Series
            df_add.reset_index(drop=True, inplace=True)
            df_stat = df_stat.append(df_add, ignore_index=True)  # append to the results
            df_stat.reset_index(drop=True, inplace=True)
            df_stat.index.name = 'idstat'
            t_end = timer()
            env.debug(1, ['END file TXT:', file_txt, 'time:', env.job_time(t_start, t_end)])
        # df_stat now holds the statistics for all the test texts we wanted to process
        # Cast the integer columns to the proper type
        int_cols = ['idtext', 'idchunk', 'idauthor', 'sentences_text', 'words_text',
                    'sentences_chunk', 'words_chunk', 'words_uniq_chunk']
        for col in int_cols:
            df_stat[col] = df_stat[col].astype(int)
        # Save the result to disk; the statistics are ready
        df_stat.to_csv(env.filename_stat_test_csv(), encoding='utf-8')
    # Re-open the statistics file for the test texts
    df_stat = pd.read_csv(env.filename_stat_test_csv(), index_col='idstat', encoding='utf-8')
    # Predict the authors
    y_res = self.model_predict(df_stat.loc[aidtext])
    df_stat.loc[aidtext, 'predict'] = y_res.astype(int)
    # Save the updated file with the predictions
    df_stat.to_csv(env.filename_stat_test_csv(), encoding='utf-8')
    return y_res  # return the predictions
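# Usage sketch (assumes these methods live on an Analyzer class, as the debug
# tags suggest, that the registry in filename_predict_csv() lists texts 1..3,
# and that a trained author model is available to model_predict):
#
#     y = Analyzer().predict([1, 2, 3], b_makestat=True)
#     print(y)   # one predicted idauthor per processed text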
def tokenize(self, dftokenz=pd.DataFrame(), persistent=True, n_frac=1):
    env = Environment()
    enc = Word_Encoder()
    t_start = timer()
    if dftokenz.empty:
        dftokenz = self.tokenz()
    if n_frac < 1:
        dftokenz = dftokenz.sample(frac=n_frac)
    env.debug(1, ['Transforming to tokenz: START %s words' % dftokenz.shape[0]])
    gmask = dftokenz.groupby(['gram'])
    df_posstat = gmask.count()
    df_posstat.to_csv(env.filename_stat_pos_tokenz_csv(), encoding='utf-8')
    print('POSTagger', 'train dataset stat:\n', gmask.count())
    fields = ['s_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
              'n_token', 'n_len', 'n_tokens2', 'n_tokens3', 'n_tokenp2', 'n_tokenp3']
    for field in fields:
        val = 0.0
        if field[0] == 's':
            val = ''
        dftokenz[field] = val
    n_letters = 0
    s_letters = env.list_rus_letters()
    di_letters = env.di_bgm_byletters
    bgm_columns = env.bgm_columns_list(mode=1)
    for column_name in bgm_columns:
        dftokenz[column_name] = None
    t_end = timer()
    env.debug(1, ['POStagger', 'Letters bigram columns added', env.job_time(t_start, t_end)])
    # Form tokenz
    t_start = timer()
    for index, serie in dftokenz.iterrows():
        a_word = enc.s2token(index, serie)
        i = 2
        for field in fields:
            dftokenz.at[index, field] = a_word[i]
            i = i + 1
        # Letters bigram binaries
        for n_l in range(0, len(a_word[0]) - 1):
            n_l2 = n_l + 1
            di_n = di_letters.get('%s%s' % (a_word[0][n_l], a_word[0][n_l2]))
            if di_n is not None:
                dftokenz.at[index, bgm_columns[di_n]] = 1
    t_end = timer()
    env.debug(1, ['Transforming to tokenz: COMPLETE', env.job_time(t_start, t_end)])
    if persistent:
        dftokenz.to_csv(env.filename_tokenz_csv(), encoding='utf-8')
        env.debug(1, ['Tokenz written to CSV:', env.filename_tokenz_csv()])
    return dftokenz
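# Worked example of the letter-bigram features (illustrative): for the word
# 'кот' the letter pairs are 'ко' and 'от'; di_bgm_byletters maps each pair to
# a column index, so tokenize() sets those two bgm_l_* columns to 1 and leaves
# the remaining bigram columns as None.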
def dict_xml2csv(self, persistent=True, lines=10000):
    t_start = timer()
    env = Environment()
    dfgram = self.grammemes()
    filename_dict = env.filename_dict_xml()
    dfcols = ['word', 'gram', 'idgram']
    df_xml = pd.DataFrame(columns=dfcols)
    env.debug(1, ['CORPUS', 'Start to load dictionary from XML:', filename_dict])
    try:
        fp = io.open(filename_dict, mode="r", encoding="utf-8")
    except Exception:
        env.debug(1, ['CORPUS', 'Failed to open dictionary file XML:', filename_dict])
    else:
        number_lines = sum(1 for line in fp)
        fp.seek(0)
        t_end = timer()
        env.debug(1, ['CORPUS', 'File opened:', 'lines', '%s' % number_lines, 'time:', env.job_time(t_start, t_end)])
        t_start = timer()
        step = number_lines // lines  # sample roughly `lines` lemmas evenly over the file
        env.debug(1, ['CORPUS', 'Read dictionary:', filename_dict, 'lines: %s step %s' % (lines, step)])
        n_line = 0
        for i in range(0, number_lines):
            line = fp.readline()
            if (line[5:10] == 'lemma') and (n_line == 0):
                tree = ET.fromstring(line)
                for elem in tree.iter('l'):
                    s_word = elem.attrib.get('t')
                    gram = ['', 0]
                    j = 0
                    for elem2 in elem.iter('g'):
                        gram[j] = elem2.attrib.get('v')
                        break  # only the first grammeme (the POS tag) is used
                    gram[1] = int(dfgram.index[dfgram['name'] == gram[0]].tolist()[0])
                    s = pd.Series(data=[s_word, gram[0], gram[1]], index=dfcols)
                    df_xml = df_xml.append(s, ignore_index=True)
                n_line += 1
            n_line += 1
            if n_line >= step:
                n_line = 0
        fp.close()
        df_xml.index.name = 'idcorpus'
        t_end = timer()
        env.debug(1, ['CORPUS', 'Dictionary loaded:', 'time:', env.job_time(t_start, t_end)])
        if persistent:
            filename_csv = env.filename_dict_csv()
            env.debug(1, ['CORPUS', 'Write dictionary to CSV:', filename_csv])
            df_xml.to_csv(filename_csv, encoding='utf-8')
            env.debug(1, ['CORPUS', 'Dictionary saved:', filename_csv])
    return df_xml
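# Usage sketch (assumes the OpenCorpus dictionary XML is at the configured
# path; `lines` controls the sampling step, not an exact row count):
#
#     df_dict = OpenCorpus().dict_xml2csv(lines=10000)
#     print(df_dict.head())   # columns: word, gram, idgram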
def pos(self, df, mode_fast=True, use_cache=True):
    env = Environment()
    enc = Word_Encoder()
    df_res = df
    t_start = timer()
    c = OpenCorpus()
    g = c.grammemes()
    dg = g.to_dict().get('name')
    # Cache file
    cache_columns = ['word', 'gram_ml', 'count']
    file_cache = env.filename_mlcache_csv()
    try:
        df_cache = pd.read_csv(file_cache, index_col='idcorpus', encoding='utf-8')
    except Exception:
        env.debug(1, ['POSTagger', 'pos', 'Failed to read cache file:', file_cache])
        df_cache = pd.DataFrame(columns=cache_columns)
    else:
        env.debug(1, ['POSTagger', 'pos', 'Read ML cache OK:', file_cache])
    a_predict = np.array([enc.word2token('')])
    n_words = df_res.shape[0]
    env.debug(1, ['POStagger', 'pos', 'START Vocabulary prediction %s words' % n_words])
    a_words = df_res['word'].tolist()
    a_ml_words = []
    predictions_voc = self.pos_by_voc(a_words)
    p_se = pd.Series(predictions_voc)
    df_res['gram'] = p_se.values
    df_res['gram_voc'] = p_se.values
    df_res['gram_ml'] = ''
    t_end = timer()
    env.debug(1, ['POStagger', 'pos', 'END Vocabulary prediction %s sec.' % env.job_time(t_start, t_end)])
    if mode_fast:
        # Only words missing from the vocabulary go to the ML model
        df_ni_voc = df_res[df_res['gram_voc'] == '']
        n_words = df_ni_voc.shape[0]
    else:
        df_ni_voc = df_res
    if not df_ni_voc.empty:
        env.debug(1, ['POStagger', 'pos', 'START Encoding %s words' % n_words])
        for index, serie in df_ni_voc.iterrows():
            word = df_ni_voc.at[index, 'word']
            a_padd = np.array([enc.word2token(word)])
            a_predict = np.append(a_predict, a_padd, axis=0)
            a_ml_words.append(word)
        a_predict = a_predict[1:, :]  # drop the dummy first row
        t_end = timer()
        env.debug(1, ['POStagger', 'pos', 'END Encoding %s words %s sec.' % (n_words, env.job_time(t_start, t_end))])
        t_start = timer()
        env.debug(1, ['POStagger', 'pos', 'START Model prediction'])
        clf = pickle.load(open(env.filename_model_tree(), 'rb'))
        predictions_ml = clf.predict(a_predict[:, 0:])
        t_end = timer()
        env.debug(1, ['POStagger', 'pos', 'END Model prediction %s sec.' % env.job_time(t_start, t_end)])
    t_start = timer()
    i = 0
    s_pvoc = ''
    s_pml = ''
    for index, row in df_res.iterrows():
        word = df_res.at[index, 'word']
        s_pvoc = df_res.at[index, 'gram_voc']
        if s_pvoc == '':
            if mode_fast:
                try:
                    j = a_ml_words.index(word)
                except ValueError:
                    pass
                else:
                    s_pml = dg.get(predictions_ml[j])
            else:
                s_pml = dg.get(predictions_ml[i])
            df_res.at[index, 'gram_ml'] = s_pml
            df_res.at[index, 'gram'] = s_pml
        i = i + 1
    t_end = timer()
    env.debug(1, ['POStagger', 'pos', 'ML predictions dataframe filled %s sec' % env.job_time(t_start, t_end)])
    # Update the ML cache with the freshly predicted words
    df_cache = pd.concat([df_cache, df_res[df_res.gram_ml != ''][['word', 'gram_ml', 'count']]])
    df_cache = df_cache.groupby(['word', 'gram_ml']).agg({'count': ['sum']})
    df_cache.reset_index(inplace=True)
    df_cache.index.name = 'idcorpus'
    df_cache.columns = cache_columns
    df_cache.sort_values(by=['count'], inplace=True, ascending=False)
    env.debug(1, ['POStagger', 'pos', 'Write ML cache to CSV:', file_cache])
    df_cache.to_csv(file_cache, encoding='utf-8')
    return df_res
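# Usage sketch (assumes a model saved by POSTagger.train and an input frame
# with 'word' and 'count' columns, as prepared in analyze_text):
#
#     df = pd.DataFrame({'word': ['мама', 'мыла', 'раму'], 'count': [1, 1, 1]})
#     df = POSTagger().pos(df)
#     print(df[['word', 'gram', 'gram_voc', 'gram_ml']])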
def analyze_text(self, columns, text_to_analyze, index=0, idauthor=0, name='', file_txt='', max_words=0):
    env = Environment()
    t_start = timer()
    env.debug(1, ['Analyzer', 'analyze_text', 'START file TXT: %s' % file_txt])
    enc = Word_Encoder()
    postg = POSTagger()
    corpus = OpenCorpus()
    dfgram = corpus.grammemes()
    file_authors = env.filename_authors_csv()  # information about the authors
    authors = pd.read_csv(file_authors, index_col='idauthor', encoding='utf-8')
    dfres = pd.DataFrame()  # empty dataframe for the results
    # Preprocessing: split the text into chunks.
    achunks = self.preprocessor(text_to_analyze, max_words)
    n_chunks = len(achunks)  # number of chunks the text was split into
    # Each element holds the sentence/word counts of the whole text and of the
    # chunk, plus the list of the chunk's words.
    env.debug(1, ['Analyzer', 'analyze_text',
                  '%s sentences %s words in %s chunks' % (achunks[0][0], achunks[0][1], n_chunks)])
    a_text_corp = []
    id_chunk = 0
    for chunk in achunks:
        t_start = timer()
        # Prepare data
        n_sent_all = chunk[0]
        n_words_all = chunk[1]
        n_sent_len_mean = chunk[2]
        n_sent_chunk = chunk[3]
        n_words_chunk = chunk[4]
        a_text_words = chunk[5]
        # Vectorize: treat each chunk as an individual text.
        vectorizer = CountVectorizer(encoding='utf-8', token_pattern=r"(?u)\b\w+\b")
        # Turn the words into a single-row matrix: one column per word,
        # the value being the number of occurrences in the document.
        X = vectorizer.fit_transform([' '.join(map(str, a_text_words))])
        n_words_chunk_check = X.sum()  # total number of words in the processed document
        env.debug(1, ['Analyzer', 'analyze_text',
                      'START process chunk %s/%s with %s words' % (id_chunk, n_chunks - 1, n_words_chunk)])
        # Total count per word (we have a single row, so the column sum equals that row's value)
        word_freq = np.asarray(X.sum(axis=0)).ravel()
        zl = zip(vectorizer.get_feature_names(), word_freq)  # words, count
        data_cols = ['gram', 'gram_voc', 'gram_ml']
        data = pd.DataFrame(list(zl), columns=['word', 'count'])
        for col in data_cols:
            data[col] = ''
        t_end = timer()
        env.debug(1, ['Ready for POS:', 'time:', env.job_time(t_start, t_end)])
        t_start = timer()
        data = postg.pos(data)
        t_end = timer()
        env.debug(1, ['POS tagged:', 'time:', env.job_time(t_start, t_end)])
        t_start = timer()
        grouped = data.sort_values('gram').groupby(['gram']).agg({'count': ['sum']})
        grouped.columns = ['n_POS']
        grouped.reset_index(inplace=True)
        grouped['f_POS'] = grouped['n_POS'] / n_words_chunk  # relative POS frequencies
        grouped = pd.merge(dfgram, grouped, left_on='name', right_on='gram', how='left').drop(
            columns=['alias', 'description', 'name', 'n_POS']).fillna(0).set_index('gram').T
        index_author = authors.index.get_loc(idauthor)
        n_uniq_words = data.shape[0]
        s_chunk = pd.Series({
            'idtext': index,
            'idchunk': id_chunk,
            'idauthor': idauthor,
            'author': authors.at[index_author, 'shortname'],
            'name': name,
            'file': file_txt,
            'sentences_text': np.int64(n_sent_all),
            'words_text': np.int64(n_words_all),
            'sentence_mean': n_sent_len_mean,
            'sentences_chunk': np.int64(n_sent_chunk),
            'words_chunk': np.int64(n_words_chunk),
            'words_uniq_chunk': np.int64(n_uniq_words),
            'uniq_per_sent_chunk': round(n_uniq_words / n_sent_chunk, 4),
            'uniq_per_words_chunk': round(n_uniq_words / n_words_chunk, 4)
        })
        s_chunk = pd.concat([s_chunk, pd.Series(grouped.values.ravel())], ignore_index=True)
        s_chunk = pd.concat([s_chunk, pd.Series([np.nan])], ignore_index=True)  # placeholder for 'predict'
        t_end = timer()
        env.debug(1, ['Analyzed', 'time:', env.job_time(t_start, t_end)])
        dfres = dfres.append(s_chunk, ignore_index=True)
        id_chunk = id_chunk + 1
    dfres.columns = columns
    return dfres
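# Usage sketch (assumes an Analyzer class, as the debug tags suggest; the
# column layout must match the results CSV header, as in process_from_texts_file):
#
#     with codecs.open('text.txt', 'r', 'utf_8_sig') as f:
#         text = f.read().strip()
#     df_stat = Analyzer().analyze_text(columns, text, index=1, idauthor=0,
#                                       name='Sample', file_txt='text.txt')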
def train(self, df=pd.DataFrame(), validation='eval', n_splits=5, b_smoketest=True, n_frac=1):
    env = Environment()
    enc = Word_Encoder()
    df_train = df
    bgm_columns = env.bgm_columns_list(mode=1)
    drop_columns = ['word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3', 'n_token']
    env.debug(1, ['POStagger', 'train', 'Drop columns: %s' % drop_columns])
    if df_train.empty:
        t_start = timer()
        df_train = self.tokenz()
        t_end = timer()
        env.debug(1, ['POSTagger', 'train', 'tokenz loaded:', 'time:', env.job_time(t_start, t_end)])
    env.debug(1, ['POStagger', 'train', 'All tokenz set shape %s' % df_train.shape[0]])
    t_start = timer()
    env.debug(1, ['POStagger', 'train', 'Learning: START'])
    if n_frac < 1:
        df_train = df_train.sample(frac=n_frac)
    env.debug(1, ['POStagger', 'train', 'Training tokenz set shape %s' % df_train.shape[0]])
    df_train = df_train.drop(columns=drop_columns, axis=1)
    env.debug(1, ['POStagger', 'Train columns: %s' % df_train.columns.tolist()])
    df_train = df_train.fillna(0)
    file_x = env.filename_xtrain_csv()
    df_train.to_csv(file_x, encoding='utf-8')
    env.debug(1, ['POStagger', 'train', 'Save X', file_x])
    y = df_train['idgram'].values
    df_train.drop(columns=['idgram'], inplace=True)
    X = df_train.values
    seed = 241
    frac_test_size = 0.2
    sc = StandardScaler()
    t2_start = timer()
    if validation == 'cv':  # cross-validation
        scoring = 'accuracy'
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        if True:  # Decision tree
            env.debug(1, ['Tree cross-validation'])
            # clf = DecisionTreeClassifier(criterion='gini', random_state=seed)  # 0.79
            # clf = KNeighborsClassifier(n_neighbors=230)
            model = DecisionTreeClassifier(criterion='entropy', random_state=seed)  # 0.81
            env.debug(1, ['Calculate cross_val_score. Splits=%s' % n_splits])
            scores = cross_val_score(model, X, y, cv=kf)
            print('DTree scores:', scores.mean(), 'raw', scores)
        if False:  # Logistic regression (disabled branch, kept for reference)
            env.debug(1, ['LGR cross-validation'])
            n_Cs = [0.01]
            X_sc = sc.fit_transform(X)
            Y = y.copy()
            Y[Y > 0] = 1  # binary target: grammeme present vs. not
            for n_c in n_Cs:
                clf = LogisticRegression(penalty='l2', solver='liblinear', C=n_c)
                env.debug(1, ['Calculate cross_val_score. Splits=%s C=%s' % (n_splits, n_c)])
                scores = cross_val_score(clf, X_sc, Y, cv=kf)
                print(scores)
        if False:  # GBM / RandomForest (disabled branch, kept for reference)
            env.debug(1, ['GBM cross-validation'])
            asteps = [20]  # GBM
            # asteps = [100]  # RandomForest
            for i in asteps:
                # clf = RandomForestClassifier(n_estimators=i)
                clf = GradientBoostingClassifier(n_estimators=i, max_depth=8)  # , max_features='sqrt'
                env.debug(1, ['Calculate cross_val_score. Splits=%s Estimators=%s' % (n_splits, i)])
                scores = cross_val_score(clf, X, y, cv=kf)
                print(scores)
    if validation == 'eval':  # hold-out evaluation with early stopping
        model = xgb.XGBClassifier(n_estimators=140, max_depth=16, colsample=1, subsample=0.5, seed=seed)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=frac_test_size, random_state=seed, shuffle=True)
        eval_set = [(X_train, y_train), (X_test, y_test)]
        f_eval = 'merror'
        # f_eval = 'mlogloss'
        model.fit(X_train, y_train, eval_metric=f_eval, eval_set=eval_set,
                  verbose=False, early_stopping_rounds=20)
        ev_scores = model.evals_result()
        ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
        print(ev_mean, ev_scores)
        xgb.plot_importance(model)
        plt.show()
    t2_end = timer()
    t_end = timer()
    env.debug(1, ['CV completed:', 'time:', env.job_time(t_start, t_end)])
    if validation == 'cv':  # re-train on all the data
        X_train, y_train = X, y
        t_start = timer()
        env.debug(1, ['Training: START'])
        model.fit(X_train, y_train)
        t_end = timer()
        env.debug(1, ['Training: END', env.job_time(t_start, t_end)])
    pickle.dump(sc, open(env.filename_scaler(), 'wb'))
    pickle.dump(model, open(env.filename_model_tree(), 'wb'))
    # Smoke test
    if b_smoketest:
        X_smoke_predict = ['съеште', 'ещё', 'этих', 'мягких', 'французских', 'булок']
        a_smoke = np.array([enc.word2token(elem) for elem in X_smoke_predict])
        y_predictions = model.predict(a_smoke[:, 0:])
        y_predictions_proba = model.predict_proba(a_smoke[:, 0:])
        print('Prediction', list(zip(X_smoke_predict, y_predictions)))
        print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
    return model
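# Usage sketch: either cross-validate a decision tree on 10% of the tokenz set,
# or run the default XGBoost hold-out evaluation (paths come from Environment):
#
#     model = POSTagger().train(validation='cv', n_splits=5, n_frac=0.1)
#     model = POSTagger().train(validation='eval')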
def vizualize2d(self, n_frac=0.01, b_annotations=False):
    n_components = 2
    env = Environment()
    c = OpenCorpus()
    di_g = c.grammemes(mode=1)
    data = self.tokenz().sample(frac=n_frac)
    data = data.fillna(0)
    tdf = pd.DataFrame(index=data.index)
    tdf['idgram'] = data['idgram']
    tdf['gram'] = data['gram']
    tdf['word'] = data['word']
    drop_columns = ['word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3', 'n_token']
    env.debug(1, ['POStagger', 'visualize2D', 'Drop columns: %s' % drop_columns])
    data = data.drop(columns=drop_columns, axis=1)
    values = data.values
    X = values[:, 1:]
    y = values[:, 0]
    # Scalers
    sc = StandardScaler()
    min_max_scaler = preprocessing.MinMaxScaler()
    max_abs_scaler = preprocessing.MaxAbsScaler()
    # Dimensionality reduction: PCA or MDS (TSNE is another option here)
    b_pca = False
    b_sne = True
    if b_pca:
        model = PCA(n_components=n_components)
    if b_sne:
        model = MDS(n_components=n_components)
    X_new = model.fit_transform(X, y)
    if b_pca:
        print('PCA ratio', n_components, 'components', model.explained_variance_ratio_)
        X_new = max_abs_scaler.fit_transform(X_new)
    tdf['PC1'] = X_new[:, 0]
    tdf['PC2'] = X_new[:, 1]
    df_groups = tdf.groupby('idgram').count()
    tdf['counts'] = 0
    for index, serie in tdf.iterrows():
        n_idgram = tdf.at[index, 'idgram']
        tdf.at[index, 'counts'] = df_groups[df_groups.index == n_idgram]['gram']
    tdf = tdf.sort_values(by=['counts'], ascending=False)
    # Draw
    i = 0
    N = df_groups.shape[0]
    s_title = ''
    if b_pca:
        s_title = '2 component PCA. Accuracy %s' % (
            round(sum(float(i) for i in model.explained_variance_ratio_), 2))
    if b_sne:
        s_title = 't-SNE'
    # Plotly (disabled branch, kept for reference)
    if False:
        py.sign_in('shashmaxus', 'AdfwTulrOoV3cSlbZT3B')
        c = ['hsl(' + str(h) + ',50%' + ',50%)' for h in np.linspace(0, 360, N)]
        data_trace = []
        for index, row in df_groups.iterrows():
            df_trace = tdf[tdf['idgram'] == index]
            g_trace = go.Scatter(
                x=df_trace['PC1'].values,
                y=df_trace['PC2'].values,
                name=df_trace['gram'].values[0],
                mode='markers',  # 'markers+text'
                marker=dict(size=8, color=i, opacity=0.8, colorscale='Viridis'),
                text=df_trace['word'],
                textfont=dict(family='sans serif', size=12))
            data_trace.append(g_trace)
            i += 1
        layout = go.Layout(
            title=s_title,
            xaxis=dict(title=('Component 1. Contribution %s' % round(model.explained_variance_ratio_[0], 2))),
            yaxis=dict(title=('Component 2. Contribution %s' % round(model.explained_variance_ratio_[1], 2))))
        fig2 = go.Figure(data=data_trace, layout=layout)
        py.image.save_as(fig2, filename='c:/prj/mlivos_data/temp/Words2.png')
    # Bokeh
    if True:
        palette = d3['Category20'][len(tdf['gram'].unique())]
        color_map = CategoricalColorMapper(factors=tdf['gram'].unique(), palette=palette)
        fig = figure(title=s_title, toolbar_location=None)
        source = ColumnDataSource(tdf[['gram', 'PC1', 'PC2']])
        fig.scatter(x='PC1', y='PC2', size=12,
                    color={'field': 'gram', 'transform': color_map},
                    legend='gram', source=source)
        show(fig)
        export_png(fig, filename="c:/prj/mlivos_data/temp/PCA.png")
    return 0
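# Usage sketch (MDS projection of a 1% sample of the tokenz set; export_png
# additionally needs selenium and a webdriver to render the PNG):
#
#     POSTagger().vizualize2d(n_frac=0.01)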
def preprocessor(self, text, max_words=0):
    env = Environment()
    t_start = timer()
    text2 = text.lower()
    env.debug(1, ['Analyzer', 'preprocessor', 'START Preprocessing:'])
    tokenizer = RegexpTokenizer(self.word_tokenizers_custom())
    tokens_words = tokenizer.tokenize(text2)  # the words of the text
    tokens_sent = sent_tokenize(text2)  # the sentences (not used on their own yet)
    n_words_count = len(tokens_words)  # number of words in the text
    n_sent_count = len(tokens_sent)  # number of sentences in the text
    n_sent_len_mean = n_words_count / n_sent_count  # mean sentence length in words
    # Split the text into chunks.
    awords = []
    # If the document is large, split it into several chunks and compute the
    # statistics for each one separately. This lets us train the model
    # correctly from a small number of large documents.
    if max_words > 0:
        n_sent_chunk = int(max_words // n_sent_len_mean)  # sentences per chunk holding max_words
        print('n_sent_chunk', n_sent_chunk)
        # Tune the chunk size so the text is split evenly.
        i_chunks = 1
        tmp_sent_chunk = n_sent_count
        while tmp_sent_chunk > n_sent_chunk:
            i_chunks = i_chunks + 1
            tmp_sent_chunk = int(math.ceil(n_sent_count // i_chunks) + (n_sent_count % i_chunks))
        n = 0
        n_sent_chunk = tmp_sent_chunk  # final number of sentences per chunk
        print('tmp_sent_chunk', tmp_sent_chunk)
        while n < n_sent_count:
            asents = tokens_sent[n:n + n_sent_chunk]  # sentences from n to n+chunk
            a_sent_words = []  # the words of the current group of sentences
            for sent in asents:
                words = tokenizer.tokenize(sent)
                a_sent_words.extend(words)
            awords.append([n_sent_count, n_words_count,
                           len(a_sent_words) / len(asents),
                           len(asents), len(a_sent_words), a_sent_words])
            n = n + n_sent_chunk
    else:
        awords.append([n_sent_count, n_words_count, n_sent_len_mean,
                       len(tokens_sent), len(tokens_words), tokens_words])
    t_end = timer()
    env.debug(1, ['Preprocessed:', 'time:', env.job_time(t_start, t_end)])
    return awords  # array with the words and the statistics
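# Worked example of the chunking math (illustrative numbers): for a text of
# 1000 sentences and 12500 words (mean 12.5 words/sentence) with max_words=6000,
# the initial chunk size is int(6000 // 12.5) = 480 sentences; the while loop
# then settles on i_chunks=3, since ceil(1000 // 3) + (1000 % 3) = 334 <= 480,
# so the text is split into three roughly equal chunks of up to 334 sentences.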