def save_topic_statistic(topic_name):
    """
    Collect statistics for a topic and store them as a TopicStatistic row.

    The average document length is counted in regex word matches over each
    document's text; the distributions come from ``calculate_statistic``
    applied to the space-joined text of all the topic's documents.

    :param topic_name: name of the topic
    :return: None
    """
    documents = Document.select().where(
        Document.topic == Topic.select().where(Topic.name == topic_name))

    word_total = sum(len(re.findall(r'\w+', doc.text)) for doc in documents)
    documents_number = len(documents)
    if documents_number:
        avg_length = word_total / documents_number
    else:
        # No documents matched the topic: keep the average at zero and
        # report it, then still store a statistics row below.
        avg_length = 0
        print("problem with save_topic_statistic, url =", topic_name)

    statistic = calculate_statistic(' '.join(doc.text for doc in documents))
    topic = Topic.select().where(Topic.name == topic_name).get()
    TopicStatistic.create(
        topic=topic,
        avg_document_len=avg_length,
        documents_number=documents_number,
        length_distribution=json.dumps(statistic['length']),
        occurrences_distribution=json.dumps(statistic['occurrence']))
def get_topic_description(topic_name):
    """
    Look up the description stored for a topic.

    :param topic_name: name of the topic to look up
    :return: string with description or None if the topic doesn't exist
    """
    matches = Topic.select().where(Topic.name == topic_name)
    return matches.get().description if len(matches) else None
def get_fresh_topics(number):
    """
    Get topics ordered by how recently their documents were updated.

    :param number: maximum number of topics to return
    :return: list of <class 'data_base.Topic'>
    """
    # Intended as the newest update time among the documents of the topic in
    # the outer row (correlated on Topic.name, exactly as before).
    latest_update = Document.select(fn.Max(Document.last_update)).where(
        Document.topic == Topic.name)

    # NOTE: the filter used to be written as
    #     Topic.name == Document.topic and Document.last_update == <subquery>
    # Python's `and` returns its right operand when the left one is truthy,
    # so only the last_update condition ever reached the SQL. The dead left
    # operand is removed here, which leaves the generated query unchanged.
    # The join is relied on to pair topics with their documents
    # (NOTE(review): confirm Document.topic is the foreign key peewee
    # infers the join predicate from).
    topics = (Topic.select()
              .join(Document)
              .where(Document.last_update == latest_update)
              .order_by(-Document.last_update))
    return list(topics)[:number]
def get_avg_document_len(topic_name):
    """
    Get the stored average document length for a topic.

    :param topic_name: name of the topic
    :return: the average length, or None when no statistics row is found
        for the topic
    """
    topic_query = Topic.select().where(Topic.name == topic_name)
    statistic = TopicStatistic.select().where(
        TopicStatistic.topic == topic_query)
    if len(statistic) > 0:
        return statistic.get().avg_document_len
    return None
def get_documents_number(topic_name):
    """
    Get the stored number of documents for a topic.

    :param topic_name: name of the topic
    :return: number of documents, or None when no statistics row is found
        for the topic
    """
    topic_query = Topic.select().where(Topic.name == topic_name)
    statistic = TopicStatistic.select().where(
        TopicStatistic.topic == topic_query)
    if len(statistic) > 0:
        return statistic.get().documents_number
    return None
def get_topic_fresh_news(topic_name, number):
    """
    Get the freshest news for the topic.

    :param topic_name: name of the topic
    :param number: maximum number of documents to return
    :return: list of <class 'data_base.Document'> ordered from the most
        recently updated, or None if the topic doesn't exist
    """
    if len(Topic.select().where(Topic.name == topic_name)) == 0:
        return None

    # Match documents through a Topic subquery, the same way the other
    # helpers in this file do, instead of comparing the foreign key with the
    # raw topic name (which only works if the name is the key it points to).
    topic_query = Topic.select().where(Topic.name == topic_name)
    news = (Document.select()
            .where(Document.topic == topic_query)
            .order_by(-Document.last_update))
    return list(news)[:number]
def topic_word_cloud(topic_name, file_name):
    """
    Build a word cloud from the text of every document in a topic.

    :param topic_name: name of the topic
    :param file_name: file to save the word cloud to
    :return: True if the cloud was built, False if the topic has no
        documents
    """
    topic_query = Topic.select().where(Topic.name == topic_name)
    documents = Document.select().where(Document.topic == topic_query)
    if not len(documents):
        return False

    make_word_cloud(' '.join(doc.text for doc in documents), file_name)
    return True
def get_best_words(topic_name, number):
    """
    Get the words that occur most often in the titles of a topic's documents.

    Only words whose first morphological parse is tagged NOUN or UNKN are
    counted, under their normal form.

    :param topic_name: name of the topic
    :param number: number of words
    :return: list of words, most frequent first, or None if the topic
        doesn't exist
    """
    if len(Topic.select().where(Topic.name == topic_name)) == 0:
        return None

    documents = Document.select().where(
        Document.topic == Topic.select().where(Topic.name == topic_name))
    titles = ' '.join(doc.title for doc in documents)

    morph = pymorphy2.MorphAnalyzer()
    word_occurrence = defaultdict(int)
    for word in re.findall(r'\w+', titles):
        parsed = morph.parse(word)[0]
        if 'NOUN' in parsed.tag or 'UNKN' in parsed.tag:
            word_occurrence[str(parsed.normal_form)] += 1

    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(word_occurrence, key=lambda item: -word_occurrence[item])
    return ranked[:number]
def make_distribution_plot(title_or_name, object, file_name1, file_name2):
    """
    Create the two distribution plots for a document or a topic.

    The first plot shows the stored word-length distribution; the second
    shows the stored occurrence distribution, sliced to
    ``config.MIN_OCCURRENCE:config.MAX_OCCURRENCE`` with a leading zero.

    :param title_or_name: document title or topic name
    :param object: 'document' or 'topic'
    :param file_name1: file to save first plot
    :param file_name2: file to save second plot
    :return: True - everything is OK, False - unknown ``object`` value or no
        statistics found
    """
    if object == 'document':
        statistic = DocumentStatistic.select().where(
            DocumentStatistic.document == Document.select().where(
                Document.title == title_or_name))
    elif object == 'topic':
        statistic = TopicStatistic.select().where(
            TopicStatistic.topic == Topic.select().where(
                Topic.name == title_or_name))
    else:
        return False

    if len(statistic) == 0:
        return False

    # Fetch the row once; both plots read from the same record.
    record = statistic.get()

    make_plot(data=json.loads(record.length_distribution),
              label='The length distribution',
              xlabel="Word's length",
              ylabel='Number of words with such length')
    plt.savefig(file_name1)
    plt.close()

    occurrences = json.loads(record.occurrences_distribution)
    make_plot(data=[0] + occurrences[
                  config.MIN_OCCURRENCE:config.MAX_OCCURRENCE],
              label="The frequency distribution of words",
              xlabel='The frequency of the word',
              ylabel='The number of words with that frequency')
    plt.savefig(file_name2)
    # Close the second figure as well, so it does not stay open and leak
    # into the next plot drawn through pyplot.
    plt.close()
    return True
def parse_and_save_topics():
    """
    Parse the topic list page and bring the Topic table in line with it.

    A topic whose url is not stored yet is created; a stored topic whose
    description differs from the parsed one gets its description updated.

    :return: None
    """
    session = requests.Session()
    session.max_redirects = config.REDIRECTS
    page = session.get("https://www.rbc.ru/story/")
    data = BeautifulSoup(page.text, 'lxml')

    # Every topic on the page is rendered as one of these blocks.
    story_items = data.find_all(
        'div', {'class': 'item item_story js-story-item'})
    for item in story_items:
        link = item.find('a', {'class': 'item__link no-injects'})
        url = link['href'].strip()
        title = item.find('span', {'class': 'item__title'}).text.strip()
        description = item.find('span', {'class': 'item__text'}).text.strip()

        stored = Topic.select().where(Topic.url == url)
        if len(stored) == 0:
            # The topic is not in the database yet.
            Topic.create(url=url, name=title, description=description)
            continue

        if stored.get().description != description:
            # The topic is known but its description has changed.
            Topic.update(description=description).where(
                Topic.url == url).execute()
def parse_ans_save_documents(topic_name):
    """
    Parse all documents for this topic and, if necessary, add them to the
    database.

    A document is (re)loaded when no stored document has both its url and
    its parsed last-update time. Its previous tags, statistics and row are
    deleted first, and the topic name is added to ``updated_topics`` so the
    topic statistics can be recalculated.

    :param topic_name: name of the topic
    :return: None (also when the topic doesn't exist)
    """
    topic_query = Topic.select().where(Topic.name == topic_name)
    if len(topic_query) == 0:
        return
    # Fetched once and reused for every new document of this topic.
    topic = topic_query.get()

    session = requests.Session()
    session.max_redirects = config.REDIRECTS
    data = BeautifulSoup(session.get(topic.url).text, 'lxml')
    documents = data.find_all(
        'div', {'class': 'item item_story-single js-story-item'})

    for document in documents:
        url = document.find(
            'a', {'class': 'item__link no-injects js-yandex-counter'}
        )['href'].strip()
        title = document.find('span', {'class': 'item__title'}).text.strip()
        # NOTE(review): presumably needed for parsing Russian dates;
        # confirm it is still required given languages=['ru'] below.
        locale.setlocale(locale.LC_ALL, 'ru_RU.UTF-8')
        last_update = dateparser.parse(
            document.find('span', {'class': 'item__info'}).text,
            languages=['ru'])

        # Skip documents already stored with this url AND this update time.
        # The conditions must be combined with `&`: Python's `and` would
        # hand peewee only the right-hand condition, so the url would be
        # ignored.
        up_to_date = Document.select().where(
            (Document.url == url) & (Document.last_update == last_update))
        if len(up_to_date) > 0:
            continue

        # Remember that the statistics of this topic must be recalculated.
        updated_topics.add(topic_name)

        # Drop everything stored for an older version of the document:
        # tags (they could have changed), statistics, then the row itself.
        Tag.delete().where(Tag.document == Document.select().where(
            Document.url == url)).execute()
        DocumentStatistic.delete().where(
            DocumentStatistic.document == Document.select().where(
                Document.url == url)).execute()
        Document.delete().where(Document.url == url).execute()

        page = get_document_text_and_tags(url)
        new_document = Document(url=url, title=title, topic=topic,
                                text=page['text'], last_update=last_update)
        new_document.save()

        # Compute and store statistics for the new document.
        save_document_statistic(new_document)

        # Store the tags parsed from the document page.
        for tag in page['tags']:
            Tag.create(document=new_document, name=tag)
if __name__ == '__main__':
    # Script entry point: refresh topics and documents, then rebuild the
    # statistics of every topic recorded in updated_topics.
    data_base.connect()
    try:
        data_base.create_tables(
            [Document, Tag, Topic, DocumentStatistic, TopicStatistic])
        parse_and_save_topics()
        for topic in Topic.select():
            parse_ans_save_documents(topic.name)
        for topic_name in updated_topics:
            # Replace the previous statistics row of the topic, if any.
            TopicStatistic.delete().where(
                TopicStatistic.topic == Topic.select().where(
                    Topic.name == topic_name)).execute()
            save_topic_statistic(topic_name)
    finally:
        # Release the connection even when parsing or saving fails.
        data_base.close()