示例#1
0
def save_topic_statistic(topic_name):
    """
    Collect statistics for a topic and store them in the database.

    The average number of words per document is computed, the texts of all
    documents of the topic are joined with spaces and handed to
    ``calculate_statistic``, and the result is saved as a new
    ``TopicStatistic`` row.
    :param topic_name: name of the topic
    :return: None
    """
    topic_filter = Topic.name == topic_name
    documents = Document.select().where(
        Document.topic == Topic.select().where(topic_filter))

    # a "word" is every run of word characters in the document text
    words_per_document = [
        len(re.findall(r'\w+', doc.text)) for doc in documents
    ]
    documents_number = len(documents)
    if documents_number:
        avg_length = sum(words_per_document) / documents_number
    else:
        # a topic without documents gets an average length of 0
        avg_length = 0
        print("problem with save_topic_statistic, url =", topic_name)

    statistic = calculate_statistic(
        ' '.join(doc.text for doc in documents))
    topic = Topic.select().where(topic_filter).get()
    TopicStatistic.create(
        topic=topic,
        avg_document_len=avg_length,
        documents_number=documents_number,
        length_distribution=json.dumps(statistic['length']),
        occurrences_distribution=json.dumps(statistic['occurrence']))
示例#2
0
def get_topic_description(topic_name):
    """
    Look up the description of the topic with the given name.
    :param topic_name: name of the topic to look up
    :return: string with description or None if the topic doesn't exist
    """
    matching_topics = Topic.select().where(Topic.name == topic_name)
    if len(matching_topics):
        return matching_topics.get().description
    return None
示例#3
0
def get_fresh_topics(number):
    """
    Get the topics ordered by how recently their newest document changed.

    Every topic is joined with the document(s) of that topic that carry the
    topic's maximum ``last_update``; the rows are ordered from the newest to
    the oldest such update and the first ``number`` of them are returned.
    :param number: number of the topics
    :return: list of <class 'data_base.Topic'>
    """
    # Correlated subquery: the newest update time among the documents that
    # belong to the topic of the current outer row.
    newest_update = Document.select(fn.Max(Document.last_update)).\
        where(Document.topic == Topic.name)
    # peewee conditions have to be combined with ``&``: Python's ``and``
    # evaluates to its right-hand operand only, which silently dropped the
    # first condition from the WHERE clause.
    topics = Topic.select().join(Document).\
        where((Topic.name == Document.topic) &
              (Document.last_update == newest_update)).\
        order_by(-Document.last_update)
    return list(topics)[:number]
示例#4
0
def get_avg_document_len(topic_name):
    """
    Read the stored average document length of a topic.
    :param topic_name: name of the topic
    :return: the average length or None if no statistic row matches the
        topic (e.g. the topic doesn't exist)
    """
    topic_subquery = Topic.select().where(Topic.name == topic_name)
    rows = TopicStatistic.select().where(
        TopicStatistic.topic == topic_subquery)
    if not len(rows):
        return None
    return rows.get().avg_document_len
示例#5
0
def get_documents_number(topic_name):
    """
    Read the stored number of documents of a topic.
    :param topic_name: name of the topic
    :return: number of documents or None if no statistic row matches the
        topic (e.g. the topic doesn't exist)
    """
    topic_subquery = Topic.select().where(Topic.name == topic_name)
    rows = TopicStatistic.select().where(
        TopicStatistic.topic == topic_subquery)
    return rows.get().documents_number if len(rows) else None
示例#6
0
def get_topic_fresh_news(topic_name, number):
    """
    Get the most recently updated documents of a topic.
    :param topic_name: name of the topic
    :param number: maximum number of documents to return
    :return: list of <class 'data_base.Document'> or None if the topic doesn't exist
    """
    known_topics = Topic.select().where(Topic.name == topic_name)
    if not len(known_topics):
        return None
    # newest documents first
    newest_first = Document.select().\
        where(Document.topic == topic_name).\
        order_by(-Document.last_update)
    return list(newest_first)[:number]
示例#7
0
def topic_word_cloud(topic_name, file_name):
    """
    Build a word cloud from the texts of all documents of a topic.
    :param topic_name: name of the topic
    :param file_name: file to save the wordcloud
    :return: True - the word cloud was built, False - the topic has no
        documents
    """
    topic_subquery = Topic.select().where(Topic.name == topic_name)
    documents = Document.select().where(Document.topic == topic_subquery)
    if not len(documents):
        return False
    make_word_cloud(' '.join(doc.text for doc in documents), file_name)
    return True
示例#8
0
def get_best_words(topic_name, number):
    """
    Get the words that occur most often in the document titles of a topic.

    Only words whose first morphological parse is tagged ``NOUN`` or
    ``UNKN`` are counted, in their normal form.
    :param topic_name: name of the topic
    :param number: number of words
    :return: list of tags or None if the topic doesn't exist
    """
    topic_filter = Topic.name == topic_name
    if not len(Topic.select().where(topic_filter)):
        return None

    documents = Document.select().where(
        Document.topic == Topic.select().where(topic_filter))
    all_titles = ' '.join(doc.title for doc in documents)

    analyzer = pymorphy2.MorphAnalyzer()
    counts = defaultdict(int)
    for token in re.findall(r'\w+', all_titles):
        # only the first parse of each word is considered
        first_parse = analyzer.parse(token)[0]
        tag = first_parse.tag
        if 'NOUN' in tag or 'UNKN' in tag:
            counts[str(first_parse.normal_form)] += 1

    # most frequent first; ties keep the order of first appearance
    return sorted(counts, key=lambda word: -counts[word])[:number]
示例#9
0
def make_distribution_plot(title_or_name, object, file_name1, file_name2):
    """
    Create the two distribution plots for a document or a topic.

    The first plot shows the stored word-length distribution, the second one
    the stored word-frequency distribution restricted to the
    ``config.MIN_OCCURRENCE:config.MAX_OCCURRENCE`` slice.
    :param title_or_name: title of the document or name of the topic
    :param object: 'document' or 'topic'
    :param file_name1: file to save first plot (length distribution)
    :param file_name2: file to save second plot (frequency distribution)
    :return: True - both plots were saved, False - ``object`` is neither
        'document' nor 'topic', or no statistic is stored for it
    """
    if object == 'document':
        statistic = DocumentStatistic.select().\
            where(DocumentStatistic.document == Document.select().
                  where(Document.title == title_or_name))
    elif object == 'topic':
        statistic = TopicStatistic.select().\
            where(TopicStatistic.topic == Topic.select().
                  where(Topic.name == title_or_name))
    else:
        return False
    if len(statistic) == 0:
        return False

    # Fetch the row once; both plots read from the same record.
    record = statistic.get()

    make_plot(data=json.loads(record.length_distribution),
              label='The length distribution',
              xlabel="Word's length",
              ylabel='Number of words with such length')
    plt.savefig(file_name1)
    plt.close()

    occurrences = json.loads(record.occurrences_distribution)
    # a leading 0 is prepended to the selected slice of the distribution
    make_plot(data=[0] + occurrences[
                  config.MIN_OCCURRENCE:config.MAX_OCCURRENCE],
              label="The frequency distribution of words",
              xlabel='The frequency of the word',
              ylabel='The number of words with that frequency')
    plt.savefig(file_name2)
    # Close the second figure as well, so it neither stays in memory nor
    # carries its state over into the next plot.
    plt.close()

    return True
示例#10
0
def parse_and_save_topics():
    """
    Parse the topic list page and synchronise the topics in the database.

    A topic whose url is not stored yet is created; a stored topic whose
    description differs from the parsed one gets its description updated.
    :return: None
    """
    session = requests.Session()
    session.max_redirects = config.REDIRECTS
    page = session.get("https://www.rbc.ru/story/")
    soup = BeautifulSoup(page.text, 'lxml')

    # every matching <div> describes one topic
    for item in soup.find_all(
            'div', {'class': 'item item_story js-story-item'}):
        link = item.find('a', {'class': 'item__link no-injects'})
        url = link['href'].strip()
        title = item.find('span', {'class': 'item__title'}).text.strip()
        description = item.find('span', {'class': 'item__text'}).text.strip()

        stored = Topic.select().where(Topic.url == url)
        if len(stored) == 0:
            # the topic is not in the database yet
            Topic.create(url=url, name=title, description=description)
            continue
        if stored.get().description != description:
            # the topic is known, but its description has changed
            Topic.update(description=description).\
                where(Topic.url == url).execute()
示例#11
0
def parse_ans_save_documents(topic_name):
    """
    Parse all documents of a topic and bring the database up to date.

    For every document listed on the topic page, the stored copy (together
    with its tags and statistics) is replaced unless a stored document has
    both the same url and the same ``last_update``. Whenever a document is
    (re)saved, the topic name is added to ``updated_topics`` so that the
    topic statistics can be recalculated later.
    :param topic_name: name of the topic whose documents are parsed
    :return: None
    """
    topic_query = Topic.select().where(Topic.name == topic_name)
    if len(topic_query) == 0:
        return
    # Fetch the topic row once: it provides the page url and is the foreign
    # key of every document saved below.
    topic = topic_query.get()

    session = requests.Session()
    session.max_redirects = config.REDIRECTS
    data = BeautifulSoup(session.get(topic.url).text, 'lxml')
    documents = data.\
        find_all('div', {'class': 'item item_story-single js-story-item'})
    if not documents:
        return

    # Loop-invariant, so it is set once instead of once per document.
    # NOTE(review): presumably needed for the Russian date text - confirm.
    locale.setlocale(locale.LC_ALL, 'ru_RU.UTF-8')

    for document in documents:
        url = document.find(
            'a', {'class': 'item__link no-injects js-yandex-counter'
                  })['href'].strip()
        title = document.find('span', {'class': 'item__title'}).text.strip()
        last_update = dateparser.parse(
            document.find('span', {'class': 'item__info'}).text,
            languages=['ru'])

        # The stored copy is current only if url AND last_update both match.
        # The conditions must be joined with ``&``: Python's ``and`` kept
        # only the last_update comparison, so any document sharing the
        # timestamp suppressed the update.
        up_to_date = Document.select().where(
            (Document.url == url) &
            (Document.last_update == last_update))
        if len(up_to_date) != 0:
            continue

        # remember that the statistics of this topic must be recalculated
        updated_topics.add(topic_name)

        stored = Document.select().where(Document.url == url)
        # remove the tags of this document because they could have changed
        Tag.delete().where(Tag.document == stored).execute()
        # remove the previous statistics
        DocumentStatistic.delete().\
            where(DocumentStatistic.document == stored).execute()
        Document.delete().where(Document.url == url).execute()

        page = get_document_text_and_tags(url)
        new_document = Document(url=url,
                                title=title,
                                topic=topic,
                                text=page['text'],
                                last_update=last_update)
        new_document.save()
        # calculate and store the statistics of the new document
        save_document_statistic(new_document)
        # store the tags of the new document
        for tag in page['tags']:
            Tag.create(document=new_document, name=tag)
示例#12
0
            cur_top = Topic.select().where(Topic.name == topic_name).get()

            new_document = Document(url=url,
                                    title=title,
                                    topic=cur_top,
                                    ast_update=last_update,
                                    text=text,
                                    last_update=last_update)

            new_document.save()
            # gets statistics
            save_document_statistic(new_document)
            # adds tags
            for tag in page['tags']:
                Tag.create(document=new_document, name=tag)


if __name__ == '__main__':
    # Script entry point: prepare the database, refresh topics and their
    # documents, then rebuild the statistics of every topic that changed.
    data_base.connect()
    data_base.create_tables([
        Document,
        Tag,
        Topic,
        DocumentStatistic,
        TopicStatistic,
    ])

    parse_and_save_topics()
    for topic in Topic.select():
        parse_ans_save_documents(topic.name)

    for topic_name in updated_topics:
        # drop the outdated statistics before saving the fresh ones
        TopicStatistic.delete().where(
            TopicStatistic.topic == Topic.select().where(
                Topic.name == topic_name)).execute()
        save_topic_statistic(topic_name)