import csv
import os

import matplotlib.pyplot as plt
import pandas as pd
from nltk import FreqDist, Text
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# NOTE: generate_file_name_with_postfix is a project-local helper assumed to
# be importable elsewhere in this package; the methods below also rely on
# self.parser, self.mystem (pymystem3.Mystem), self.stop_words and
# self.spec_chars being set up by the surrounding class.


def plot_frequency_analysis_of_words(analyzer, is_constraint=None):
    """Plot word frequencies over the whole corpus, one graph per codex."""
    if not is_constraint:
        # Raw top-100 frequencies, one CSV per codex (10 codexes).
        for i in range(10):
            data = pd.read_csv(generate_file_name_with_postfix(
                analyzer.config['articles_frequency_info_file'], str(i)),
                delimiter=',')
            data.plot(x='word', y='frequency', figsize=(50, 7),
                      kind='scatter')
            plt.xticks(rotation=60)
            plt.show()
    else:
        # Frequencies filtered by the constraint, sorted before plotting.
        for i in range(10):
            data = pd.read_csv(generate_file_name_with_postfix(
                analyzer.config[
                    'articles_frequency_info_file_with_constraint'],
                str(i)),
                delimiter=',')
            data = data.sort_values(by='frequency', axis='index')
            data.plot(x='word', y='frequency', figsize=(50, 7),
                      kind='scatter')
            plt.xticks(rotation=60)
            plt.show()

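# Hedged usage sketch: `Analyzer` here is a hypothetical stand-in for
# whatever object provides `config` and `parser` in this project. The CSVs
# read above are the ones written by save_codex_hist_info further down, with
# columns `word` and `frequency`.
#
#     analyzer = Analyzer()                                    # hypothetical
#     plot_frequency_analysis_of_words(analyzer)               # top-100 words
#     plot_frequency_analysis_of_words(analyzer, is_constraint=True)
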
def save_syntax_analysis(analyzer):
    """Run and save a syntax analysis for every sentence that contains the
    target phrase (one output file per article)."""
    for codex_id in tqdm(range(len(analyzer.parser.codex_urls))):
        target_words_info = pd.read_csv(
            generate_file_name_with_postfix(
                analyzer.config['information_about_target_words'],
                str(codex_id)))
        for row in tqdm(target_words_info.itertuples()):
            # The last column holds the '~'-joined sentences; row[1] is the
            # article_id.
            for sentence in row[-1].split('~'):
                analyzer.save_syntax_analysis_by_text(
                    sentence,
                    generate_file_name_with_postfix(
                        analyzer.config['article_target_words_realation_info'],
                        str(row[1])))

def links_on_target_words_analysis(self):
    """For every article that contains the target phrase, resolve the links
    that follow each occurrence and store them in a per-codex CSV."""
    for codex_id in tqdm(range(len(self.parser.codex_urls))):
        out_path = generate_file_name_with_postfix(
            self.config['information_about_target_words_with_links'],
            str(codex_id))
        # Recreate the output file from scratch on every run.
        if os.path.exists(out_path):
            os.remove(out_path)
        with open(out_path, mode='w') as information_about_target_words_file:
            information_about_target_words_writer = csv.writer(
                information_about_target_words_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            information_about_target_words_writer.writerow([
                'article_id', 'article_title', 'article_url',
                'links_on_target'
            ])
            target_words_info = pd.read_csv(
                generate_file_name_with_postfix(
                    self.config['information_about_target_words'],
                    str(codex_id)))
            for row in tqdm(target_words_info.itertuples()):
                links_on_target = list()
                # row[3] holds the '~'-joined text parts that follow each
                # occurrence of the target phrase.
                for part_of_target_words in row[3].split('~'):
                    link = self.parser.get_links_on_target_words_by_id_and_target_words(
                        row[1], part_of_target_words)
                    links_on_target.append(link if link else 'None')
                information_about_target_words_writer.writerow([
                    row[1], row[2],
                    self.parser.get_article_url_by_id(row[1]),
                    ' '.join(links_on_target)
                ])

def save_information_about_target_words_by_codex_type(self, codex_type,
                                                      codex_id):
    """For every article of a codex, find each occurrence of the target
    phrase 'если иное не предусмотрено' ('unless otherwise provided'),
    reconstruct the sentence around it, and save the result to a CSV."""
    raw_articles_info = self.parser.sorted_articles_info[codex_type]
    out_path = generate_file_name_with_postfix(
        self.config['information_about_target_words'], str(codex_id))
    # Recreate the output file from scratch on every run.
    if os.path.exists(out_path):
        os.remove(out_path)
    with open(out_path, mode='w') as information_about_target_words_file:
        information_about_target_words_writer = csv.writer(
            information_about_target_words_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL)
        information_about_target_words_writer.writerow([
            'article_id', 'article_title', 'parts_after_target_words',
            'sentences'
        ])
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            if text.find('если иное не предусмотрено') == -1:
                continue
            text_parts = text.split('если иное не предусмотрено')
            # The sentence fragment immediately before each occurrence of the
            # target phrase...
            parts_before_target_words = [
                text_parts[i].split('.')[-1]
                for i in range(len(text_parts) - 1)
            ]
            # ...and the fragment immediately after it.
            parts_after_target_words = [
                text_parts[i].split('.')[0]
                for i in range(1, len(text_parts))
            ]
            # Stitch the fragments back into full sentences around the phrase.
            sentences = [
                before + 'если иное не предусмотрено' + after
                for before, after in zip(parts_before_target_words,
                                         parts_after_target_words)
            ]
            information_about_target_words_writer.writerow([
                article_info.id, article_info.title,
                '~'.join(parts_after_target_words), '~'.join(sentences)
            ])

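# A minimal, self-contained sketch (toy input, not real codex text) of the
# sentence-reconstruction logic above: split on the target phrase, take the
# last sentence fragment before each occurrence and the first fragment after
# it, and stitch them back together.
def _demo_target_phrase_split():
    target = 'если иное не предусмотрено'
    text = ('Первое правило. Действует, если иное не предусмотрено '
            'законом. Конец.')
    parts = text.split(target)
    before = [part.split('.')[-1] for part in parts[:-1]]
    after = [part.split('.')[0] for part in parts[1:]]
    # -> [' Действует, если иное не предусмотрено законом']
    return [b + target + a for b, a in zip(before, after)]
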
def plot_unique_words_in_articles_analysis_on_one_graph(analyzer):
    """Plot the unique-word frequency of every codex on a single graph,
    sorted by frequency."""
    frames = [
        pd.read_csv(generate_file_name_with_postfix(
            analyzer.config['unique_words_in_articles_analysis_file'],
            str(i)),
            delimiter=',') for i in range(10)
    ]
    data = pd.concat(frames)
    # Only the sorted frequency curve is plotted, so article_id is dropped.
    data = data.sort_values('unique_words_frequency')
    data = data.reset_index()
    data.drop('article_id', axis='columns', inplace=True)
    data.drop('index', axis='columns', inplace=True)
    data.plot()
    plt.show()

def plot_unique_words_in_articles_analysis(analyzer):
    """Per-codex plots of unique-word frequency against article_id."""
    for i in range(10):
        data = pd.read_csv(generate_file_name_with_postfix(
            analyzer.config['unique_words_in_articles_analysis_file'],
            str(i)),
            delimiter=',')
        data = data.sort_values('unique_words_frequency')
        data.plot(x='article_id', y='unique_words_frequency', kind='scatter')
        plt.show()

def save_unique_words_in_articles_analysis(self, codex_type, codex_id):
    """For every article of a codex, compute the share of tokens that occur
    exactly once (hapax legomena) and save the ratios to a CSV."""
    raw_articles_info = self.parser.sorted_articles_info[codex_type]
    articles_info = list()
    for article_info in tqdm(raw_articles_info):
        text = self.parser.get_article_text_by_id(article_info.id)
        text = text.lower()
        text = self.remove_chars_from_text(text, self.spec_chars)
        # Lemmatize with pymystem3, then tokenize the lemmatized text.
        article_tokens = word_tokenize(' '.join(self.mystem.lemmatize(text)))
        for stop_word in self.stop_words:
            while stop_word in article_tokens:
                article_tokens.remove(stop_word)
        if not article_tokens:
            continue  # guard against empty articles (avoids division by zero)
        text = Text(article_tokens)
        f_dist = FreqDist(text)
        # Keep only the words that occur exactly once.
        f_dist = list(filter(lambda item: item[1] == 1, f_dist.items()))
        articles_info.append(
            (article_info.id, len(f_dist) / len(article_tokens)))
    out_path = generate_file_name_with_postfix(
        self.config['unique_words_in_articles_analysis_file'], str(codex_id))
    # Recreate the output file from scratch on every run.
    if os.path.exists(out_path):
        os.remove(out_path)
    with open(out_path, mode='w') as unique_words_in_articles_analysis_file:
        unique_words_in_articles_analysis_writer = csv.writer(
            unique_words_in_articles_analysis_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL)
        unique_words_in_articles_analysis_writer.writerow(
            ['article_id', 'unique_words_frequency'])
        for frequency_info in articles_info:
            unique_words_in_articles_analysis_writer.writerow(
                [frequency_info[0], frequency_info[1]])

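# A minimal sketch of the ratio computed above: the share of tokens that are
# hapax legomena (occur exactly once). Toy tokens, assumed already lemmatized
# and stop-word-free.
def _demo_unique_words_ratio():
    tokens = ['суд', 'решение', 'суд', 'закон']
    f_dist = FreqDist(tokens)
    hapaxes = [word for word, count in f_dist.items() if count == 1]
    return len(hapaxes) / len(tokens)  # 2 hapaxes / 4 tokens = 0.5
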
def save_syntax_analysis_in_links(analyzer):
    """Run and save a syntax analysis for the text behind every link found
    after the target phrase (one output file per article)."""
    for codex_id in tqdm(range(len(analyzer.parser.codex_urls))):
        target_words_info = pd.read_csv(
            generate_file_name_with_postfix(
                analyzer.config['information_about_target_words_with_links'],
                str(codex_id)))
        for row in tqdm(target_words_info.itertuples()):
            # The last column holds the space-separated links; row[1] is the
            # article_id.
            out_path = generate_file_name_with_postfix(
                analyzer.config[
                    'article_target_words_in_links_realation_info'],
                str(row[1]))
            for url in row[-1].split(' '):
                if url != 'None':
                    analyzer.save_syntax_analysis_by_text(
                        analyzer.parser.get_text_by_url(url),
                        out_path,
                        is_many_sentences=True)
                else:
                    analyzer.save_syntax_analysis_by_text('None', out_path)

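# Hedged pipeline sketch, inferred from which CSVs each step reads and writes
# (not an authoritative run order):
#
#     save_information_about_target_words_by_codex_type(...)  # writes information_about_target_words
#     save_syntax_analysis(analyzer)                          # reads it, one file per article
#     analyzer.links_on_target_words_analysis()               # writes ..._with_links
#     save_syntax_analysis_in_links(analyzer)                 # reads ..._with_links
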
def save_codex_hist_info(self, codex_type, codex_id, constraint=None):
    """Save word frequencies over the whole corpus of a codex."""
    raw_articles_info = self.parser.sorted_articles_info[codex_type]
    articles_tokens = list()
    for article_info in tqdm(raw_articles_info):
        text = self.parser.get_article_text_by_id(article_info.id)
        text = text.lower()
        text = self.remove_chars_from_text(text, self.spec_chars)
        # Lemmatize with pymystem3, then tokenize the lemmatized text.
        article_tokens = word_tokenize(' '.join(self.mystem.lemmatize(text)))
        for stop_word in self.stop_words:
            while stop_word in article_tokens:
                article_tokens.remove(stop_word)
        articles_tokens.extend(article_tokens)
    text = Text(articles_tokens)
    f_dist = FreqDist(text)
    if not constraint:
        out_path = generate_file_name_with_postfix(
            self.config['articles_frequency_info_file'], str(codex_id))
    else:
        out_path = generate_file_name_with_postfix(
            self.config['articles_frequency_info_file_with_constraint'],
            str(codex_id))
    # Recreate the output file from scratch on every run.
    if os.path.exists(out_path):
        os.remove(out_path)
    with open(out_path, mode='w') as articles_frequency_info_file:
        articles_frequency_info_writer = csv.writer(
            articles_frequency_info_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL)
        articles_frequency_info_writer.writerow(['word', 'frequency'])
        if not constraint:
            # Without a constraint, keep the 100 most common words.
            items = f_dist.most_common(100)
        else:
            # With a constraint, keep every word whose absolute count
            # exceeds it.
            items = list(
                filter(lambda item: item[1] > constraint, f_dist.items()))
        for frequency_info in items:
            articles_frequency_info_writer.writerow([
                frequency_info[0],
                frequency_info[1] / len(articles_tokens)
            ])

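# A minimal sketch of the two output modes above: without a constraint the
# 100 most common words are kept; with one, every word whose absolute count
# exceeds it. Either way, counts are normalized by the total token count.
def _demo_frequency_rows(tokens, constraint=None):
    f_dist = FreqDist(tokens)
    items = f_dist.most_common(100) if not constraint else [
        item for item in f_dist.items() if item[1] > constraint
    ]
    return [(word, count / len(tokens)) for word, count in items]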