def ru_words_test():
    lang = 'russian'
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf', generateWordList=True)
        reader.loadFile()
        words, lines, articles = reader.parseDocPages(19, 21)
        # print the words of each block; block 6 is printed sentence by sentence
        for i in range(11):
            print(i)
            if i != 6:
                print(' '.join(
                    [x['text'] for x in words if x['block_count'] == i]))
            else:
                for j in range(15):
                    print(j, ' sentence')
                    print(' '.join([
                        x['text'] for x in words
                        if x['block_count'] == i and x['sentence_count'] == j
                    ]))
            print("----")
    else:
        df_source = loadArticlesInEnglish()
def list_w_p(path, mass):
    '''Return a DataFrame of work-power data calculated from
    the .txt files within the path.

    Arguments:
        path: the directory containing the .txt files
        mass: mass, surface area or volume of the sample
    '''
    # collect a GV object for every .txt file
    instances = []
    for s in os.listdir(path):
        if '.txt' in s:
            f_path = os.path.join(path, s)
            inst = rd.GV(filename=f_path, mass=mass)
            instances.append(inst)
    # stack the work-power data of every instance into one table
    # (pd.concat replaces the deprecated DataFrame.append)
    frames = [inst.work_power() for inst in instances]
    w_p = pd.concat(frames) if frames else pd.DataFrame()
    return w_p.reset_index(drop=True)
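# A hypothetical usage sketch for list_w_p; the directory name and the sample
# mass below are made-up values, not taken from the original code.
def demo_list_w_p():
    w_p = list_w_p('data/gv_curves', mass=0.0025)  # hypothetical path and mass
    print(w_p.head())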
def lemmaEng():
    lang = 'english'
    partMatchesCounted = False  # if True then custom, else rough precision/recall
    num = 8
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')
        reader.loadFile()
        words, lines, articles = reader.parseDocPages()
        print(len(articles))
        df = reader.getArticlesDataframe()
    else:
        df = loadArticlesInEnglish()
    processor = Preprocessor(stopwordsList=None, lang=lang)
    # the raw source text is kept as-is for both languages
    df['text'] = df['Source Text']
    # lemmatize the first article only, with and without stopword removal
    subset = df.iloc[0:1, :]
    for index, row in tqdm(subset.iterrows(), total=subset.shape[0]):
        text_lemma_sw = processor.preprocess_text(row['Source Text'],
                                                  removeStopWords=True,
                                                  useLemmas=True)
        text_lemma = processor.preprocess_text(row['Source Text'],
                                               removeStopWords=False,
                                               useLemmas=True)
        subset.at[index, 'text_lemma_sw'] = text_lemma_sw
        subset.at[index, 'text_lemma'] = text_lemma
        source = row['Source Text']
def object_generator(path, mass):
    # lazily yield a CV object for every .txt file in the path
    for s in os.listdir(path):
        if '.txt' in s:
            f_path = os.path.join(path, s)
            yield rd.CV(filename=f_path, mass=mass)
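# A hypothetical usage sketch for object_generator; the directory name and the
# sample mass are made-up values.
def demo_object_generator():
    # iterate over the CV objects without building a list first
    for cv_obj in object_generator('data/cv_curves', mass=0.0025):
        print(cv_obj.scan_rate)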
def multi_cv(ax,
             path,
             title=None,
             mass=1.0,
             color_set=('black', 'blue', 'green', 'red', 'pink', 'brown',
                        'yellow'),
             xlabel=None,
             ylabel=None,
             filter_on=False,
             smooth_range=None,
             legend_font=12,
             **kwargs):
    '''Draw multiple CV curves on a single axes.

    Arguments:
        ax: the target axes
        path: the directory containing the .txt files
        title: the axes title
        mass: mass, surface area or volume of the sample
    '''
    gp.Update_axe(ax, title=title, **kwargs)
    # collect a CV object for every .txt file
    cv = []
    for s in os.listdir(path):
        if '.txt' in s:
            f_path = os.path.join(path, s)
            obj = rd.CV(filename=f_path, mass=mass)
            cv.append(obj)
    # draw one line per CV object, smoothing first when the filter is on
    for counter, obj in enumerate(cv):
        if filter_on and smooth_range is not None:
            gp.MultiLine(smoother_cv(obj, smooth_range),
                         ax,
                         label=obj.scan_rate,
                         color=color_set[counter])
        else:
            gp.MultiLine(obj,
                         ax,
                         label=obj.scan_rate,
                         color=color_set[counter])
    # fall back to the column names of the first file for the axis labels
    ax.set_xlabel(cv[0].columns[0] if xlabel is None else xlabel)
    ax.set_ylabel(cv[0].columns[1] if ylabel is None else ylabel)
    gp.Auto_legend(ax,
                   bbox_to_anchor=(0, 1.03),
                   loc='upper left',
                   fontsize=legend_font)
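# A hypothetical usage sketch for multi_cv; the directory name and the sample
# mass are made-up values.
def demo_multi_cv():
    fig, ax = plt.subplots()
    multi_cv(ax, 'data/cv_curves', title='CV curves', mass=0.0025)
    plt.show()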
def multi_c_d(ax,
              path,
              title=None,
              mass=1.0,
              unit=' A/g',
              color_set=('black', 'blue', 'green', 'red', 'pink', 'brown',
                         'yellow'),
              xlabel=None,
              ylabel=None,
              legend_font=12,
              b_to_a=(1, 0.75),
              **kwargs):
    '''Draw multiple charge/discharge curves on a single axes.

    Arguments:
        ax: the target axes
        path: the directory containing the .txt files
        title: the axes title
        mass: mass, surface area or volume of the sample
    '''
    gp.Update_axe(ax, title=title, **kwargs)
    # collect a GV object for every .txt file
    instances = []
    for s in os.listdir(path):
        if '.txt' in s:
            f_path = os.path.join(path, s)
            inst = rd.GV(filename=f_path, mass=mass)
            instances.append(inst)
    # draw the charge and discharge curve of every instance; keep only the
    # charge-curve handles so each current appears once in the legend
    handles = []
    for counter, inst in enumerate(instances):
        l_char = gp.MultiLine(inst.charge_curve(),
                              ax,
                              label=str(inst.current) + unit,
                              color=color_set[counter])
        gp.MultiLine(inst.discharge_curve(),
                     ax,
                     label=str(inst.current) + unit,
                     color=color_set[counter])
        handles += l_char  # MultiLine returns a list of line objects
    ax.set_xlabel(instances[0].columns[0] if xlabel is None else xlabel)
    ax.set_ylabel(instances[0].columns[1] if ylabel is None else ylabel)
    ax.set_xlim(left=0)
    ax.set_ylim(top=0.8, bottom=0)
    ax.legend(handles=handles,
              bbox_to_anchor=b_to_a,
              loc='upper right',
              frameon=False,
              fontsize=legend_font)
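# A hypothetical usage sketch for multi_c_d; the directory name and the sample
# mass are made-up values.
def demo_multi_c_d():
    fig, ax = plt.subplots()
    multi_c_d(ax, 'data/gv_curves', title='Charge/discharge curves',
              mass=0.0025)
    plt.show()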
def test_ru_graph_keys_words_numbers():
    lang = 'russian'
    reader = Reader('elsevier journal.pdf')
    reader.loadFile()
    words, lines, articles = reader.parseDocPages()
    print(len(articles))
    df_source = reader.getArticlesDataframe()
    df = df_source.copy()
    df['text'] = ''
    df['noun_phrases_num'] = 0
    processor = Preprocessor(stopwordsList=None, lang=lang)
    sw = processor.stopwords
    morph = pymorphy2.MorphAnalyzer()
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        text = processor.preprocess_text(row['Source Text'],
                                         removeStopWords=True,
                                         useLemmas=False)
        df.at[index, 'text'] = text
        # part-of-speech tag every word and count the nouns, adjectives
        # and participles
        pos = [morph.parse(w)[0].tag.POS for w in re.findall(r"[\w']+", text)]
        count = Counter(pos)
        df.at[index, 'noun_phrases_num'] = (count['NOUN'] + count['ADJF'] +
                                            count['PRTF'])
    df['keys_phrases_num'] = df.apply(
        lambda row: len(row['Keywords'].split(',')), axis=1)
    df['keys_words_num'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['Keywords'])), axis=1)
    df['words_num_sw_incl'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['Source Text'])), axis=1)
    df['words_num'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['text'])), axis=1)
    stats = df[[
        'keys_phrases_num', 'keys_words_num', 'words_num',
        'words_num_sw_incl', 'noun_phrases_num'
    ]]
    stats.to_excel("noun_phrases_num.xlsx")
    lst_phrases = stats['keys_phrases_num'].tolist()
    lst_keys = stats['keys_words_num'].tolist()
    lst_words = stats['words_num'].tolist()
    lst_words_num_sw_incl = stats['words_num_sw_incl'].tolist()
    lst_noun_phrases_num = stats['noun_phrases_num'].tolist()
    x = list(range(1, len(lst_words) + 1))

    def plot_series(y, title, ylabel, suffix):
        # draw, label, save and show a single series
        plt.figure()
        plt.plot(x, y)
        plt.title(title)
        plt.xlabel("Article number")
        plt.ylabel(ylabel)
        plt.savefig(title + " " + suffix + ".png", bbox_inches='tight')
        plt.show()

    plot_series(lst_phrases, 'Number of key phrases',
                'Number of key phrases', 'lst_phrases')
    plot_series(lst_keys, 'Number of keywords',
                'Number of keywords', 'lst_keys')
    plot_series(lst_words_num_sw_incl, 'Number of words in the texts',
                'Number of words in the article text', 'words_num_sw_incl')
    plot_series(lst_words,
                'Number of words in the texts (without stopwords)',
                'Number of words in the article text', 'lst_words')
    plot_series(lst_noun_phrases_num,
                'Number of nouns, adjectives and participles in the texts',
                'Number of words', 'lst_noun_phrases_num')
    # compare all three word counts on one figure
    plt.figure()
    plt.plot(x, lst_words_num_sw_incl, 'b', label='Including stopwords')
    plt.plot(x, lst_words, 'g', label='Excluding stopwords')
    plt.plot(x, lst_noun_phrases_num, 'y',
             label='Nouns, adjectives and participles')
    title = 'Comparison of word counts in the texts'
    plt.title(title)
    plt.xlabel("Article number")
    plt.ylabel('Number of words')
    plt.legend()  # without this call the line labels would never be shown
    plt.savefig(title + " compare_words_num_on_sw_pos" + ".png",
                bbox_inches='tight')
    plt.show()
def SimpleNN():
    warnings.filterwarnings("ignore")
    lang = 'russian'
    nltk.download("stopwords")
    lst_stopwords = stopwords.words("russian")
    # treat single letters (Russian and Latin) as stopwords as well
    alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    alphabet += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    lst_stopwords.extend(list(alphabet))
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf',
                        generateWordList=True,
                        additional_stopwords=lst_stopwords)
        reader.loadFile()
        words, lines, articles = reader.parseDocPages(4, 2190)
    df = pd.DataFrame.from_dict(words)
    cat_features = [
        'size', 'flags', 'font', 'morph_pos', 'morph_animacy', 'morph_aspect',
        'morph_case', 'morph_gender', 'morph_involvement', 'morph_mood',
        'morph_number', 'morph_person', 'morph_tense', 'morph_transitivity',
        'morph_voice'
    ]
    all_columns = list(df.columns)
    # one-hot encode the categorical features, keep the other columns as-is
    df = pd.concat([
        pd.get_dummies(df[col], prefix=col)
        if col in cat_features and col != 'otherSigns' else df[col]
        for col in all_columns
    ], axis=1)
    # turn the punctuation signs into separate binary columns
    values = [',', '.', r'\)', r'\(', r'\[', r'\]']
    for value in values:
        df['otherSigns' + '_' + value] = np.where(
            df['otherSigns'].str.contains(value), "1", "0")
    df = df.drop(['otherSigns'], axis=1)
    print('all columns')
    for col in all_columns:
        print(col)
    print()
    print('new columns')
    for col in df.columns:
        print(col)
    featuresToRemove = [
        'text', 'morph_normalform', 'morph_lexeme', 'span_count',
        'line_count', 'block_count', 'page_num', 'article_num', 'color'
    ]
    df = df.drop(featuresToRemove, axis=1)
    print("df.dtypes")
    print(df.dtypes)
    df.to_csv('all_words_features.csv', index=False)
    # keep only the numeric columns for the classifier
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    newdf = df.select_dtypes(include=numerics)
    print("newdf.columns")
    print(newdf.columns)
    keywords = newdf[newdf['is_keyword'] == 1]
    print('len of keywords')
    print(keywords.shape)
    keywords.to_csv('all_keywords_features.csv', index=False)
    # balance the classes: sample as many non-keywords as there are keywords,
    # sampling with replacement only if there are too few of them
    try:
        notkeywords = newdf[newdf['is_keyword'] == 0].sample(n=len(keywords),
                                                             replace=False)
    except ValueError:
        notkeywords = newdf[newdf['is_keyword'] == 0].sample(n=len(keywords),
                                                             replace=True)
    print('len of notkeywords')
    print(notkeywords.shape)
    notkeywords.to_csv('all_notkeywords_features.csv', index=False)
    newdf = pd.concat([notkeywords, keywords])
    newdf = newdf.sample(frac=1)  # shuffle the rows
    df_Y = newdf.pop('is_keyword')
    Y = df_Y.values
    X = newdf.values.astype(float)
    print('X.shape')
    print(X.shape)
    # evaluate the model with stratified 10-fold cross-validation
    estimator = KerasClassifier(build_fn=create_baseline,
                                input_dim=len(newdf.columns),
                                epochs=100,
                                batch_size=5,
                                verbose=0)
    kfold = StratifiedKFold(n_splits=10, shuffle=True)
    results = cross_val_score(estimator, X, Y, cv=kfold, scoring='f1')
    print("Baseline: %.2f%% (%.2f%%)" %
          (results.mean() * 100, results.std() * 100))
    # earlier runs: Baseline: 86.09% (4.30%); 87.93% (1.14%);
    # 71.70% (3.28%); 74.48% (1.10%)
    print('bye-bye')
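# `create_baseline` is used as the KerasClassifier build_fn above but is not
# defined in this section. Below is a minimal sketch of such a build_fn; the
# single hidden layer and the sigmoid output are assumptions, not the original
# implementation.
def create_baseline(input_dim=10):
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.models import Sequential

    # a small dense binary classifier: one hidden layer, one sigmoid output
    model = Sequential()
    model.add(Dense(input_dim, input_dim=input_dim, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model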
def main():
    lang = 'russian'
    partMatchesCounted = False  # if True then custom, else rough precision/recall
    num = 8
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')
        reader.loadFile()
        words, lines, articles = reader.parseDocPages()
        print(len(articles))
        df = reader.getArticlesDataframe()
    else:
        df = loadArticlesInEnglish()
    processor = Preprocessor(stopwordsList=None, lang=lang)
    # the raw source text is used as-is for both languages
    df['text'] = df['Source Text']
    print(df.head())
    # run the three extractors; each one stores its keywords and metrics in df
    rakeExtractor.extractKeywords(df,
                                  num=num,
                                  metricsCount=True,
                                  partMatchesCounted=partMatchesCounted)
    print(df.head())
    kw = df['Keywords'].values
    tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords())
    tfidfBlobExtractor.extractKeywords(df,
                                       num=num,
                                       metricsCount=True,
                                       partMatchesCounted=partMatchesCounted)
    textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                          language=lang)
    textRankExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted)
    # collect the per-article metrics of every extractor
    x = []
    y = {
        'rake': {'precision': [], 'recall': [], 'f1': []},
        'textrank': {'precision': [], 'recall': [], 'f1': []},
        'tfidf': {'precision': [], 'recall': [], 'f1': []}
    }
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        x.append(index)
        values = row['textrank_metrics'].split(',')
        y['textrank']['precision'].append(values[0])
        y['textrank']['recall'].append(values[1])
        y['textrank']['f1'].append(values[2])
        values = row['tfidf_blob_metrics'].split(',')
        y['tfidf']['precision'].append(values[0])
        y['tfidf']['recall'].append(values[1])
        y['tfidf']['f1'].append(values[2])
        values = row['rake_metrics'].split(',')
        y['rake']['precision'].append(values[0])
        y['rake']['recall'].append(values[1])
        y['rake']['f1'].append(values[2])
    # one bar chart per metric, comparing the mean values of the extractors
    metrics = ['precision', 'recall', 'f1']
    for i in range(len(metrics)):
        fig, ax = plt.subplots()
        bar_values = [
            statistics.mean(list(map(float, y['textrank'][metrics[i]]))),
            statistics.mean(list(map(float, y['rake'][metrics[i]]))),
            statistics.mean(list(map(float, y['tfidf'][metrics[i]])))
        ]
        bar_label = bar_values
        bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
        autolabel(ax, bar_plot, bar_label)
        plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
        title = ('Metric ' + metrics[i] + ' for ' + str(num) +
                 ' found keywords based on data in ' + lang +
                 ' (partMatchesCounted = ' + str(partMatchesCounted) + ')')
        plt.title(title)
        plt.savefig(title + ".png", bbox_inches='tight')
        plt.show()
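# `autolabel` is called above but not defined in this section. A minimal
# sketch, assuming it writes each bar's value above the bar:
def autolabel(ax, bar_plot, bar_label):
    # annotate every bar with its corresponding label
    for rect, label in zip(bar_plot, bar_label):
        ax.text(rect.get_x() + rect.get_width() / 2.0,
                rect.get_height(),
                str(label),
                ha='center',
                va='bottom')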
def lemmaRuWithQuality():
    lang = 'russian'
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')
        reader.loadFile()
        words, lines, articles = reader.parseDocPages()
        print(len(articles))
        df_source = reader.getArticlesDataframe()
    else:
        df_source = loadArticlesInEnglish()
    # every combination of the three experiment settings
    conditions = list(
        itertools.product(
            [False, True],  # removeStopWords
            [False, True],  # partMatchesCounted: if True then custom,
                            # else rough precision/recall
            [4, 8]  # number of keywords
        ))
    df = None
    condition_i = 0
    for condition in conditions:
        removeStopWords, partMatchesCounted, num = condition
        print('condition: ', condition)
        if df is None:
            # first condition: read and lemmatize the texts once
            print('Read DF')
            df = df_source.copy()
            df['text'] = ''
            processor = Preprocessor(stopwordsList=None, lang=lang)
            sw = processor.stopwords
            processor.stopwords.extend(processor.get_normal_form_list(sw))
            if lang == 'russian':
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    df.at[index, 'text'] = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True,
                        applyLemmasToText=True)
            else:
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    text = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True)
                    df.at[index, 'text'] = text
        elif condition_i == 4:
            # from the fifth condition on, additionally remove the stopwords
            for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                text = processor.preprocess_text(row['text'],
                                                 removeStopWords=True,
                                                 useLemmas=True,
                                                 applyLemmasToText=False)
                df.at[index, 'text'] = text
        condition_i = condition_i + 1
        rakeExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted,
                                      textprocessor=processor)
        tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords(),
                                                textprocessor=processor)
        tfidfBlobExtractor.extractKeywords(
            df,
            num=num,
            metricsCount=True,
            partMatchesCounted=partMatchesCounted)
        textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                              language=lang,
                                              textprocessor=processor)
        textRankExtractor.extractKeywords(
            df,
            num=num,
            metricsCount=True,
            partMatchesCounted=partMatchesCounted)
        # collect the per-article metrics of every extractor
        x = []
        y = {
            'rake': {'precision': [], 'recall': [], 'f1': []},
            'textrank': {'precision': [], 'recall': [], 'f1': []},
            'tfidf': {'precision': [], 'recall': [], 'f1': []}
        }
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            x.append(index)
            values = row['textrank_metrics'].split(',')
            y['textrank']['precision'].append(values[0])
            y['textrank']['recall'].append(values[1])
            y['textrank']['f1'].append(values[2])
            values = row['tfidf_blob_metrics'].split(',')
            y['tfidf']['precision'].append(values[0])
            y['tfidf']['recall'].append(values[1])
            y['tfidf']['f1'].append(values[2])
            values = row['rake_metrics'].split(',')
            y['rake']['precision'].append(values[0])
            y['rake']['recall'].append(values[1])
            y['rake']['f1'].append(values[2])
        # one bar chart per metric and condition
        metrics = ['precision', 'recall', 'f1']
        for i in range(len(metrics)):
            fig, ax = plt.subplots()
            bar_values = [
                statistics.mean(list(map(float, y['textrank'][metrics[i]]))),
                statistics.mean(list(map(float, y['rake'][metrics[i]]))),
                statistics.mean(list(map(float, y['tfidf'][metrics[i]])))
            ]
            bar_label = [round(bv, 2) for bv in bar_values]
            bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
            autolabel(ax, bar_plot, bar_label)
            plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
            title = ('Metric ' + metrics[i] + ' for ' + str(num) +
                     ' found keywords based on lemmatized data in ' + lang +
                     ' (partMatchesCounted = ' + str(partMatchesCounted) +
                     ')' + ' (removeStopWords = ' + str(removeStopWords) +
                     ')')
            plt.title(title)
            plt.savefig(title + ".png", bbox_inches='tight')
            plt.show()  # drop this call to only save the figures
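# `loadArticlesInEnglish` is called in several functions above but not defined
# in this section. A minimal sketch of its assumed contract: it returns a
# DataFrame with at least 'Source Text' and 'Keywords' columns. The CSV
# filename below is hypothetical.
def loadArticlesInEnglish():
    return pd.read_csv('articles_english.csv')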