Example #1
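# This parser appears to target a XenForo-based forum ("structItem",
# "message-attribution" markup): it lists active threads, then fetches each
# thread's last page and collects the newest posts.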
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 10)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="reply-count"]/span[@data-xf-init="tooltip"]/text()')
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/text()'
    )
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/@href'
    )

    # remove unwanted content: titles
    dictList = [
        "$",
        "*:::::::the official what did you do to you mkiv today thread::::::::*",
        "??",
        "Ask a Simple Question",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # iterate over the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
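            # append an out-of-range page number; XenForo presumably clamps it
            # to the thread's last page, so we land on the newest posts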
            curParentUrl = curParentUrl + "page-1000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)
            # load article into tree
            pageTree = parsers_common.get_article_tree(
                domain,
                curParentUrl,
                cache='cacheStamped',
                pageStamp=parentPagesStamp)

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree, '//h4[@qid="message-username"]//text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//article/div[@class="bbWrapper"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/time/@datetime'
            )
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/@href'
            )

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(
                    articlePostsDict["descriptions"], j)
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.raw_to_datetime(
                    curArtPubDate,
                    "%Y-%m-%dT%H:%M:%S%z")  # 2021-01-28T16:15:42-0500
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic " + str(i + 1) + " post no. " +
                    str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) +
                    ") is " + articlePostsDict["urls"][j], 2)

    return articleDataDict
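
All six examples share the same entry point: the caller builds the topic-list tree and passes in an empty result dict. A minimal driver sketch follows; the import path, the example domain, and the uncached get_article_tree() call are assumptions, not confirmed by the source.

import parsers_common  # assumed import path; the real project layout may differ

domain = "https://forum.example.com"  # hypothetical forum front page
# the five lists that every fill_article_dict() above appends into
articleDataDict = {key: [] for key in
                   ("authors", "descriptions", "pubDates", "titles", "urls")}
# assumption: get_article_tree() can fetch the topic-list page without a stamp
pageTree = parsers_common.get_article_tree(domain, domain)
articleDataDict = fill_article_dict(articleDataDict, pageTree, domain)
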
Example #2
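# This parser appears to target an older phpBB2-style board ("forumline"
# tables, "postbody" spans); two description XPaths serve as fallbacks for
# posts whose body sits in the second "postbody" span.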
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/h3[@class="adds-list-title"]/span[@class="adds-list-meta"]/text()'
    )
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/h3[@class="adds-list-title"]/text()'
    )
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/@href'
    )

    # reverse the list order to the direction we need
    parentPages = parsers_common.dict_reverse_order(parentPages)

    # remove unwanted content: titles
    dictList = [
        "AMD",
        "Apple",
        "Assassin",
        "Batman",
        "Battlefield",
        "Call of Duty",
        "Cyberpunk",
        "Diablo 2",
        "Dying",
        "Escape From Tarkov",
        "Euro Truck",
        "Evil",
        "FIFA",
        "Far Cry",
        "Forza",
        "Galaxy",
        "Grand Theft",
        "IPhon",
        "Kindle",
        "MMORPG",
        "MSI",
        "MacBook",
        "MacOS",
        "Mafia",
        "Mass Effect",
        "Meizu",
        "Minecraft",
        "Nintendo",
        "Pixel",
        "PlayStation",
        "Steam",
        "Tanks",
        "Vidia",
        "War Thunder",
        "Watercool",
        "Windows",
        "Xbox",
        "arvutikast",
        "exile",
        "foorumiga seotud",
        "konsool",
        "korpust",
        "moderaatorite",
        "seotud vead",
        "siia lingid",
        "toiteplok",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # iterate over the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
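            # keep only the first field of the stamp (presumably the reply
            # count) and drop the volatile &sid= session id so the cached URL
            # and its stamp stay stable between fetches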
            parentPages["stamps"][i] = parentPages["stamps"][i].split("/")[0]
            parentPages["urls"][i] = parentPages["urls"][i].split("&sid=")[0]
            pageTree = parsers_common.get_article_tree(
                domain,
                parentPages["urls"][i],
                cache='cacheStamped',
                pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree,
                '//table[@class="forumline"]/tr/td[1]/span[@class="name"]/b/a/text()'
            )
            articlePostsDict["descriptions1"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//table[@class="forumline"]/tr/td[2]/table/tr[3]/td/span[@class="postbody"][1]',
                parent=True)
            articlePostsDict["descriptions2"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//table[@class="forumline"]/tr/td[2]/table/tr[3]/td/span[@class="postbody"][2]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//table[@class="forumline"]/tr/td[2]/table/tr[1]/td/span[@class="postdetails"]/span[@class="postdetails"][1]/text()[1]'
            )
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//table[@class="forumline"]/tr/td[2]/table/tr[1]/td/span[@class="postdetails"]/a/@href'
            )

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(
                    articlePostsDict["descriptions1"], j)
                if not curArtDesc:
                    curArtDesc = parsers_common.get(
                        articlePostsDict["descriptions2"], j)
                curArtDesc = curArtDesc.replace(
                    '</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(
                    curArtDesc, '<div class="quotetitle">', "</div>",
                    "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = curArtPubDate[0:16]
                curArtPubDate = parsers_datetime.guess_datetime(
                    curArtPubDate)  # 14.07.2020 07:59
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(
                    curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic " + str(i + 1) + " post no. " +
                    str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) +
                    ") is " + articlePostsDict["urls"][j], 2)

    return articleDataDict
Example #3
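# This parser appears to target a MyBB board ("subject_old",
# "post_body scaleimages" markup) with relative dates ("Eile"/"Täna")
# in the post headers.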
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@data-lang="Vastuseid"]/a/text()')
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//span[@class=" subject_old"]/a/text()')
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//span[@class=" subject_old"]/a/@href')

    # remove unwanted content: titles
    #dictList = []
    #parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # iterate over the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("&sid=")[0]
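            # &page=-1 presumably makes MyBB serve the thread's last page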
            curParentUrl = curParentUrl + "&page=-1"
            curParentUrl = parsers_common.str_domain_url(domain, curParentUrl)
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)
            # load article into tree
            pageTree = parsers_common.get_article_tree(
                domain,
                curParentUrl,
                cache='cacheStamped',
                pageStamp=parentPagesStamp)

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="author_information"]/strong/span[@class="largetext"]/a/text()'
            )
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//div[@class="post_body scaleimages"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//div[@class="post_head"]/span[@class="post_date"]',
                parent=True)
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="post_content"]/div[@class="post_head"]/div/strong/a/@href'
            )

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(
                    articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.replace(
                    '</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(
                    curArtDesc, '<div class="quotetitle">', "</div>",
                    "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_common.str_lchop(
                    curArtPubDate, '<span title="')
                curArtPubDate = curArtPubDate.split(" <span class")[0]
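                # "Eile" is Estonian for "yesterday" and "Täna" for "today";
                # replace these relative labels with concrete dates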
                if "Eile" in curArtPubDate or "Täna" in curArtPubDate:
                    curArtPubDate = curArtPubDate.split('">')[1]
                    curArtPubDate = parsers_datetime.replace_string_with_timeformat(
                        curArtPubDate,
                        "Eile</span>",
                        "%d-%m-%Y",
                        offsetDays=-1)
                    curArtPubDate = parsers_datetime.replace_string_with_timeformat(
                        curArtPubDate, "Täna</span>", "%d-%m-%Y", offsetDays=0)
                else:
                    curArtPubDate = curArtPubDate.split('">')[0]
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(
                    curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic " + str(i + 1) + " post no. " +
                    str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) +
                    ") is " + articlePostsDict["urls"][j], 2)

    return articleDataDict
Example #4
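# This parser appears to serve several phpBB-style boards: version 1 selects
# the older "tablebg" markup used by kipper.ee and militaar.net, the else
# branch handles newer markup, and arutelud.com gets a hard-coded fallback
# thread when no active topics are found.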
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 8)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    version = 0
    if "kipper.ee" in domain or "militaar.net" in domain:
        version = 1
        parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/p[@class="topicdetails"]/text()')
        parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/a[@class="topictitle"]/text()')
        parentPages["urls"] =   parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/a[@class="topictitle"]/@href')
    else:
        parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/dd[@class="posts"]/text()')
        parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/*/div/a[@class="topictitle"]/text()')
        parentPages["urls"] =   parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/*/div/a[@class="topictitle"]/@href')

    if not parentPages["urls"] and "arutelud.com" in domain:
        rss_print.print_debug(__file__, "aktiivseid teemasid ei leitud, arutelude foorumis külastame mammutteemat", 1)
        parentPages["stamps"] = parsers_common.list_add_or_assign(parentPages["stamps"], 0, "")
        parentPages["titles"] = parsers_common.list_add_or_assign(parentPages["titles"], 0, "Arutelud")
        parentPages["urls"] =   parsers_common.list_add_or_assign(parentPages["urls"], 0, "https://arutelud.com/viewtopic.php?f=3&t=4&sd=d&sk=t&st=7")

    # remove unwanted content: titles
    dictList = [
        "Race.Fi:",
        "Write my",
        "PÕLVAMAA, VÕRUMAA JA VALGAMAA CB JA HAM SIDE",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # iterate over the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("&sid=")[0]
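            # request a huge &start= offset; phpBB presumably clamps it to the last page of posts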
            curParentUrl = curParentUrl + "&start=100000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)
            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            articlePostsDict = {}
            if version in (1, 2):
                rss_print.print_debug(__file__, "kasutame spetsiifilist hankimist, domain = " + domain, 2)
                articlePostsDict["authors"] =       parsers_common.xpath_to("list", pageTree, '//tr/td/b[@class="postauthor"]/text()')
                articlePostsDict["descriptions"] =  parsers_common.xpath_to("list", pageTree, '//tr/td/div[@class="postbody"][1]', parent=True)
                articlePostsDict["pubDates"] =      parsers_common.xpath_to("list", pageTree, '//tr/td[@class="gensmall"]/div[@style="float: right;"]/text()')
                articlePostsDict["urls"] =          parsers_common.xpath_to("list", pageTree, '//tr/td[@class="gensmall"]/div[@style="float: right;"]/a/@href')
            else:
                rss_print.print_debug(__file__, "kasutame üldist hankimist, domain = " + domain, 3)
                articlePostsDict["authors"] =       parsers_common.xpath_to("list", pageTree, '//p[@class="author"]//strong//text()')
                articlePostsDict["descriptions"] =  parsers_common.xpath_to("list", pageTree, '//div[@class="content"]', parent=True)
                articlePostsDict["pubDates"] =      parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/time/@datetime')
                articlePostsDict["urls"] =          parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/a/@href')

                if not articlePostsDict["pubDates"]:
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/time/text()')
                if not articlePostsDict["pubDates"]:
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/text()[1]')
                if not articlePostsDict["pubDates"]:
                    rss_print.print_debug(__file__, "could not fetch any timestamps", 0)
                elif len(articlePostsDict["pubDates"][0]) < 5:
                    rss_print.print_debug(__file__, "fetched time[0] is too short: '" + articlePostsDict["pubDates"][0] + "', trying an alternative...", 0)
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/text()[2]')
                    if not articlePostsDict["pubDates"] or len(articlePostsDict["pubDates"][0]) < 5:
                        rss_print.print_debug(__file__, "fetched time[0] is still too short", 0)
                    else:
                        rss_print.print_debug(__file__, "fetched time[0]: '" + articlePostsDict["pubDates"][0] + "'", 4)

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.replace('</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(curArtDesc, '<div class="quotetitle">', "</div>", "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
                curArtPubDate = parsers_datetime.remove_weekday_strings(curArtPubDate)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "eile", "%d %m %Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "täna", "%d %m %Y", offsetDays=0)
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
Example #5
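# This parser appears to target a bbPress (WordPress) forum
# ("bbp-reply-content" markup); post URLs are rebuilt from anchor ids
# on the thread page.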
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 1)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree, '//tbody/tr/th[@class="col-1"]/text()')
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//tbody/tr/th[@class="col-7 teemapealkiri"]/a/text()')
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//tbody/tr/th[@class="col-4"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Lõvide perekonna uus teema",
        "abort",
        "beebi",
        "ivf",
        "lapse",
        "rase ",
        "rased",
        "triibupüüdjad",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # iterate over the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("/#")[0]
            pageTree = parsers_common.get_article_tree(
                domain,
                curParentUrl,
                cache='cacheStamped',
                pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//div[@class="bbp-reply-content entry-content"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="post_date date updated"]/text()')
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="bbp-reply-header entry-title"]/@id')

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # description
                curArtDesc = parsers_common.get(
                    articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.split(
                    '<div class="gdrts-rating-block ')[0]
                curArtDesc = parsers_html.html_remove_single_parents(
                    curArtDesc)
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(
                    parentPages["urls"],
                    i) + "/#" + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic " + str(i + 1) + " post no. " +
                    str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) +
                    ") is " + articlePostsDict["urls"][j], 2)

    # remove unwanted content: descriptions
    dictList = [
        " liba "
        "Kommentaar eemaldatud.",
        "Liba?",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "descriptions")

    return articleDataDict
Example #6
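# This parser appears to target the LHV forum (judging by the title filters);
# a jumpToPage query is used to pull a long page of the newest posts.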
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="meta"][4]/span/text()'
    )
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@title')
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Börsihai",
        "Cleveroni aktsiate ost/müük/oksjon",
        "Head uut aastat – prognoosid",
        "Keegi malet soovib mängida",
        "LHV Pank paremaks",
        "Uurimis- ja lõputööde küsimustikud",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # iterate over the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # load article into tree
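            # the jumpToPage/pagesOfMaxSize query presumably pulls one long
            # page containing the newest posts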
            pageTree = parsers_common.get_article_tree(
                domain,
                parentPages["urls"][i] +
                '?listEventId=jumpToPage&listEventParam=100&pagesOfMaxSize=true',
                cache='cacheStamped',
                pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/p[@class="author"]/strong/a/text()'
            )
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-content temporary-class"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/node()'
            )
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/@href'
            )

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j,
                    parsers_common.get(articlePostsDict["descriptions"], j))

                # pubDates: turn "15.01.2012 23:49" (or "Eile HH:MM") into a datetime()
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(
                    curArtPubDate, "Eile", "%d.%m.%Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.add_missing_date_to_string(
                    curArtPubDate, "%d.%m.%Y %H:%M", "%d.%m.%Y ")
                curArtPubDate = parsers_datetime.raw_to_datetime(
                    curArtPubDate, "%d.%m.%Y %H:%M")
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parentPages["urls"][i] + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic post no. " + str(j + 1) + "/(" +
                    str(len(articlePostsDict["urls"])) + ") is " +
                    articlePostsDict["urls"][j], 2)

    return articleDataDict