Example #1
def plot_frequency_analysis_of_words(analyzer, is_constraint=None):
    """Plot word frequencies over the whole corpus."""
    # Constrained frequencies live in a separate file and are sorted
    # before plotting; 10 is the number of codices in the corpus.
    key = ('articles_frequency_info_file_with_constraint'
           if is_constraint else 'articles_frequency_info_file')
    for i in range(10):
        data = pd.read_csv(
            generate_file_name_with_postfix(analyzer.config[key], str(i)),
            delimiter=',')
        if is_constraint:
            data = data.sort_values(by='frequency', axis='index')
        data.plot(x='word', y='frequency', figsize=(50, 7), kind='scatter')
        plt.xticks(rotation=60)
        plt.show()
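All nine snippets are evidently extracted from a single corpus-analysis module and carry no imports of their own. A hedged reconstruction of the shared imports, inferred from the calls below (module choices such as nltk and pymystem3 are assumptions, not confirmed by the listing):

# Assumed shared imports; inferred from usage, not shown in the source.
import csv
import os

import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.text import Text
from pymystem3 import Mystem  # self.mystem is presumably a Mystem()
from tqdm import tqdm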
Example #2
def save_syntax_analysis(analyzer):
    for codex_id in tqdm(range(len(analyzer.parser.codex_urls))):
        target_words_info = pd.read_csv(
            generate_file_name_with_postfix(
                analyzer.config['information_about_target_words'],
                str(codex_id)))
        for row in tqdm(target_words_info.itertuples()):
            for sentence in row[-1].split('~'):
                analyzer.save_syntax_analysis_by_text(
                    sentence,
                    generate_file_name_with_postfix(
                        analyzer.config['article_target_words_realation_info'],
                        str(row[1])))
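The positional access on row is easy to misread: DataFrame.itertuples() yields the index as element 0, so row[1] is the first CSV column ('article_id') and row[-1] is the last one ('sentences' here). A quick illustration:

# Illustrative only: positional indexing on itertuples() rows.
df = pd.DataFrame({'article_id': [7], 'sentences': ['a~b']})
row = next(df.itertuples())
assert row[0] == 0        # the DataFrame index
assert row[1] == 7        # first column, 'article_id'
assert row[-1] == 'a~b'   # last column, 'sentences'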
Example #3
def links_on_target_words_analysis(self):
    for codex_id in tqdm(range(len(self.parser.codex_urls))):
        file_name = generate_file_name_with_postfix(
            self.config['information_about_target_words_with_links'],
            str(codex_id))
        # The remove is redundant: mode='w' truncates an existing file.
        if os.path.exists(file_name):
            os.remove(file_name)
        with open(file_name, mode='w') as information_about_target_words_file:
            information_about_target_words_writer = csv.writer(
                information_about_target_words_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            information_about_target_words_writer.writerow(
                ['article_id', 'article_title', 'article_url',
                 'links_on_target'])
            target_words_info = pd.read_csv(
                generate_file_name_with_postfix(
                    self.config['information_about_target_words'],
                    str(codex_id)))
            for row in tqdm(target_words_info.itertuples()):
                links_on_target = list()
                for part_of_target_words in row[3].split('~'):
                    # Query the parser once per part instead of twice.
                    links = self.parser.get_links_on_target_words_by_id_and_target_words(
                        row[1], part_of_target_words)
                    links_on_target.append(links if links else 'None')
                information_about_target_words_writer.writerow([
                    row[1], row[2],
                    self.parser.get_article_url_by_id(row[1]),
                    ' '.join(links_on_target)])
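One caveat that applies to every csv-writing example here: the csv module documentation recommends opening the target file with newline='' so the writer does not emit blank rows on Windows. The pattern, with file_name standing in for any of the paths above:

# Recommended pattern from the csv docs when writing:
with open(file_name, mode='w', newline='') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)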
Example #4
def save_information_about_target_words_by_codex_type(
        self, codex_type, codex_id):
    raw_articles_info = self.parser.sorted_articles_info[codex_type]
    file_name = generate_file_name_with_postfix(
        self.config['information_about_target_words'], str(codex_id))
    if os.path.exists(file_name):
        os.remove(file_name)
    with open(file_name, mode='w') as information_about_target_words_file:
        information_about_target_words_writer = csv.writer(
            information_about_target_words_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL)
        information_about_target_words_writer.writerow(
            ['article_id', 'article_title', 'parts_after_target_words',
             'sentences'])
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            # The target phrase means 'unless otherwise provided'.
            if 'если иное не предусмотрено' in text:
                # Reassemble each sentence containing the phrase from the
                # fragment before it and the fragment after it.
                text_parts = text.split('если иное не предусмотрено')
                parts_before_target_words = [
                    part.split('.')[-1] for part in text_parts[:-1]]
                parts_after_target_words = [
                    part.split('.')[0] for part in text_parts[1:]]
                sentences = [
                    before + 'если иное не предусмотрено' + after
                    for before, after in zip(parts_before_target_words,
                                             parts_after_target_words)]
                information_about_target_words_writer.writerow([
                    article_info.id, article_info.title,
                    '~'.join(parts_after_target_words), '~'.join(sentences)])
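To make the split-and-reassemble step concrete, here is the same logic run on a toy string:

# Toy walk-through of the sentence reconstruction above.
text = 'Правило A, если иное не предусмотрено законом. Правило B.'
parts = text.split('если иное не предусмотрено')
before = [p.split('.')[-1] for p in parts[:-1]]  # ['Правило A, ']
after = [p.split('.')[0] for p in parts[1:]]     # [' законом']
print([b + 'если иное не предусмотрено' + a for b, a in zip(before, after)])
# -> ['Правило A, если иное не предусмотрено законом']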
Example #5
def plot_unique_words_in_articles_analysis_on_one_graph(analyzer):
    """Sorted unique-word frequencies for every codex on a single graph."""
    data = pd.concat([
        pd.read_csv(
            generate_file_name_with_postfix(
                analyzer.config['unique_words_in_articles_analysis_file'],
                str(i)),
            delimiter=',') for i in range(10)
    ])
    # Normalize article_id to [0, 1]; note the column is dropped again
    # below, so only 'unique_words_frequency' ends up on the plot.
    data['article_id'] = data['article_id'] / data['article_id'].max()
    data = data.sort_values('unique_words_frequency')
    data = data.reset_index()
    data.drop('article_id', axis='columns', inplace=True)
    data.drop('index', axis='columns', inplace=True)
    data.plot()
    plt.show()
Example #6
def plot_unique_words_in_articles_analysis(analyzer):
    """Unique-word frequency against article_id, one plot per codex."""
    for i in range(10):
        data = pd.read_csv(
            generate_file_name_with_postfix(
                analyzer.config['unique_words_in_articles_analysis_file'],
                str(i)),
            delimiter=',')
        data = data.sort_values('unique_words_frequency')
        data.plot(x='article_id', y='unique_words_frequency', kind='scatter')
        plt.show()
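The analyzer object itself never appears in the listing; a hypothetical stub is enough to drive the plotting helpers (StubAnalyzer and the CSV path are placeholders, not part of the original code):

# Hypothetical usage; the class and the file name are placeholders.
class StubAnalyzer:
    config = {'unique_words_in_articles_analysis_file': 'unique_words.csv'}

plot_unique_words_in_articles_analysis(StubAnalyzer())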
Example #7
def save_unique_words_in_articles_analysis(self, codex_type, codex_id):
    raw_articles_info = self.parser.sorted_articles_info[codex_type]
    articles_info = list()
    for article_info in tqdm(raw_articles_info):
        text = self.parser.get_article_text_by_id(article_info.id)
        text = text.lower()
        text = self.remove_chars_from_text(text, self.spec_chars)
        # Lemmatize with mystem, then re-tokenize the joined lemmas.
        article_tokens = word_tokenize(' '.join(self.mystem.lemmatize(text)))
        for stop_word in self.stop_words:
            while stop_word in article_tokens:
                article_tokens.remove(stop_word)
        text = Text(article_tokens)
        f_dist = FreqDist(text)
        # Keep only the words that occur exactly once (hapax legomena).
        f_dist = list(filter(lambda item: item[1] == 1, f_dist.items()))
        articles_info.append(
            (article_info.id, len(f_dist) / len(article_tokens)))
    file_name = generate_file_name_with_postfix(
        self.config['unique_words_in_articles_analysis_file'], str(codex_id))
    if os.path.exists(file_name):
        os.remove(file_name)
    with open(file_name, mode='w') as unique_words_in_articles_analysis_file:
        unique_words_in_articles_analysis_writer = csv.writer(
            unique_words_in_articles_analysis_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL)
        unique_words_in_articles_analysis_writer.writerow(
            ['article_id', 'unique_words_frequency'])
        for frequency_info in articles_info:
            unique_words_in_articles_analysis_writer.writerow(
                [frequency_info[0], frequency_info[1]])
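The item[1] == 1 filter computes the hapax legomena, and NLTK's FreqDist exposes this directly; before the filter reassigns f_dist, the per-article ratio could equally be written as:

# Equivalent to the filter above, using FreqDist's built-in helper.
unique_ratio = len(f_dist.hapaxes()) / len(article_tokens)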
Example #8
def save_syntax_analysis_in_links(analyzer):
    for codex_id in tqdm(range(len(analyzer.parser.codex_urls))):
        target_words_info = pd.read_csv(
            generate_file_name_with_postfix(
                analyzer.config['information_about_target_words_with_links'],
                str(codex_id)))
        for row in tqdm(target_words_info.itertuples()):
            file_name = generate_file_name_with_postfix(
                analyzer.config[
                    'article_target_words_in_links_realation_info'],
                str(row[1]))
            for url in row[-1].split(' '):
                if url != 'None':
                    analyzer.save_syntax_analysis_by_text(
                        analyzer.parser.get_text_by_url(url),
                        file_name,
                        is_many_sentences=True)
                else:
                    analyzer.save_syntax_analysis_by_text('None', file_name)
Example #9
def save_codex_hist_info(self, codex_type, codex_id, constraint=None):
    """Save word frequencies over the whole corpus."""
    raw_articles_info = self.parser.sorted_articles_info[codex_type]
    articles_tokens = list()
    for article_info in tqdm(raw_articles_info):
        text = self.parser.get_article_text_by_id(article_info.id)
        text = text.lower()
        text = self.remove_chars_from_text(text, self.spec_chars)
        article_tokens = word_tokenize(' '.join(self.mystem.lemmatize(text)))
        for stop_word in self.stop_words:
            while stop_word in article_tokens:
                article_tokens.remove(stop_word)
        articles_tokens.extend(article_tokens)
    text = Text(articles_tokens)
    f_dist = FreqDist(text)
    # Without a constraint: the 100 most common words; with one: every
    # word whose absolute count exceeds the constraint. Counts are
    # normalized by the total token count either way.
    if not constraint:
        key = 'articles_frequency_info_file'
        rows = f_dist.most_common(100)
    else:
        key = 'articles_frequency_info_file_with_constraint'
        rows = list(
            filter(lambda item: item[1] > constraint, f_dist.items()))
    file_name = generate_file_name_with_postfix(self.config[key],
                                                str(codex_id))
    if os.path.exists(file_name):
        os.remove(file_name)
    with open(file_name, mode='w') as articles_frequency_info_file:
        articles_frequency_info_writer = csv.writer(
            articles_frequency_info_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL)
        articles_frequency_info_writer.writerow(['word', 'frequency'])
        for frequency_info in rows:
            articles_frequency_info_writer.writerow([
                frequency_info[0],
                frequency_info[1] / len(articles_tokens)])
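Finally, every example leans on generate_file_name_with_postfix, which the listing never defines. A minimal sketch, assuming the postfix is inserted before the file extension:

# Assumed helper, not shown in the original listing:
# 'freq.csv' + '3' -> 'freq_3.csv'.
def generate_file_name_with_postfix(file_name, postfix):
    root, ext = os.path.splitext(file_name)
    return f'{root}_{postfix}{ext}'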