# NOTE: imports assumed from the surrounding project; the exact module paths may differ.
import parsers_common
import parsers_datetime
import parsers_html
import rss_config
import rss_print


def fill_article_dict(articleDataDict, pageTree, domain):
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 10)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//div[@class="reply-count"]/span[@data-xf-init="tooltip"]/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/text()')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/@href')

    # remove unwanted content: titles
    dictList = [
        "$",
        "*:::::::the official what did you do to you mkiv today thread::::::::*",
        "??",
        "Ask a Simple Question",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl + "page-1000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//h4[@qid="message-username"]//text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//article/div[@class="bbWrapper"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/time/@datetime')
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/@href')

            # go through the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDate, "%Y-%m-%dT%H:%M:%S%z")  # 2021-01-28T16:15:42-0500
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
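# --- Hedged usage sketch (illustration only, not part of the parser modules) ---
# Shows how a caller might drive fill_article_dict() above. The key layout of
# articleDataDict is taken from the code above; the driver name _example_fill()
# and the idea of passing in an already-fetched pageTree are assumptions.
def _example_fill(pageTree, domain):
    # empty per-feed accumulator, with the list-per-key layout used by fill_article_dict()
    articleDataDict = {
        "authors": [],
        "descriptions": [],
        "pubDates": [],
        "titles": [],
        "urls": [],
    }
    # pageTree is the forum index tree (e.g. obtained via parsers_common.get_article_tree)
    return fill_article_dict(articleDataDict, pageTree, domain)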
def fill_article_dict(articleDataDict, pageTree, domain):
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/h3[@class="adds-list-title"]/span[@class="adds-list-meta"]/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/h3[@class="adds-list-title"]/text()')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/@href')

    # reverse the list order to suit processing
    parentPages = parsers_common.dict_reverse_order(parentPages)

    # remove unwanted content: titles
    dictList = [
        "AMD",
        "Apple",
        "Assassin",
        "Batman",
        "Battlefield",
        "Call of Duty",
        "Cyberpunk",
        "Diablo 2",
        "Dying",
        "Escape From Tarkov",
        "Euro Truck",
        "Evil",
        "FIFA",
        "Far Cry",
        "Forza",
        "Galaxy",
        "Grand Theft",
        "IPhon",
        "Kindle",
        "MMORPG",
        "MSI",
        "MacBook",
        "MacOS",
        "Mafia",
        "Mass Effect",
        "Meizu",
        "Minecraft",
        "Nintendo",
        "Pixel",
        "PlayStation",
        "Steam",
        "Tanks",
        "Vidia",
        "War Thunder",
        "Watercool",
        "Windows",
        "Xbox",
        "arvutikast",
        "exile",
        "foorumiga seotud",
        "konsool",
        "korpust",
        "moderaatorite",
        "seotud vead",
        "siia lingid",
        "toiteplok",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            parentPages["stamps"][i] = parentPages["stamps"][i].split("/")[0]
            parentPages["urls"][i] = parentPages["urls"][i].split("&sid=")[0]

            pageTree = parsers_common.get_article_tree(domain, parentPages["urls"][i], cache='cacheStamped', pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[1]/span[@class="name"]/b/a/text()')
            articlePostsDict["descriptions1"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[2]/table/tr[3]/td/span[@class="postbody"][1]', parent=True)
            articlePostsDict["descriptions2"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[2]/table/tr[3]/td/span[@class="postbody"][2]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[2]/table/tr[1]/td/span[@class="postdetails"]/span[@class="postdetails"][1]/text()[1]')
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[2]/table/tr[1]/td/span[@class="postdetails"]/a/@href')

            # go through the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions1"], j)
                if not curArtDesc:
                    curArtDesc = parsers_common.get(articlePostsDict["descriptions2"], j)
                curArtDesc = curArtDesc.replace('</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(curArtDesc, '<div class="quotetitle">', "</div>", "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = curArtPubDate[0:16]
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)  # 14.07.2020 07:59
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//div[@data-lang="Vastuseid"]/a/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//span[@class=" subject_old"]/a/text()')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//span[@class=" subject_old"]/a/@href')

    # remove unwanted content: titles
    #dictList = []
    #parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("&sid=")[0]
            curParentUrl = curParentUrl + "&page=-1"
            curParentUrl = parsers_common.str_domain_url(domain, curParentUrl)
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//div[@class="author_information"]/strong/span[@class="largetext"]/a/text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//div[@class="post_body scaleimages"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="post_head"]/span[@class="post_date"]', parent=True)
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="post_content"]/div[@class="post_head"]/div/strong/a/@href')

            # go through the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.replace('</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(curArtDesc, '<div class="quotetitle">', "</div>", "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_common.str_lchop(curArtPubDate, '<span title="')
                curArtPubDate = curArtPubDate.split(" <span class")[0]
                if "Eile" in curArtPubDate or "Täna" in curArtPubDate:
                    curArtPubDate = curArtPubDate.split('">')[1]
                    curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "Eile</span>", "%d-%m-%Y", offsetDays=-1)
                    curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "Täna</span>", "%d-%m-%Y", offsetDays=0)
                else:
                    curArtPubDate = curArtPubDate.split('">')[0]
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
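# --- Illustrative sketch only ---
# Approximates, with the standard library, what the "Eile"/"Täna" handling above
# relies on: a relative-day marker in the scraped HTML is replaced by a concrete
# date. The real logic lives in parsers_datetime.replace_string_with_timeformat();
# this is not that implementation.
from datetime import datetime, timedelta


def _replace_relative_day(text, marker, dateFormat, offsetDays=0):
    # e.g. replace "Eile</span> 21:15" with "<yesterday's date> 21:15"
    if marker in text:
        stamp = (datetime.now() + timedelta(days=offsetDays)).strftime(dateFormat)
        text = text.replace(marker, stamp)
    return text

# _replace_relative_day('Eile</span> 21:15', "Eile</span>", "%d-%m-%Y", offsetDays=-1)
# -> e.g. '13-02-2021 21:15' (the concrete date depends on "today")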
def fill_article_dict(articleDataDict, pageTree, domain):
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 8)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    version = 0

    if "kipper.ee" in domain or "militaar.net" in domain:
        version = 1
        parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/p[@class="topicdetails"]/text()')
        parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/a[@class="topictitle"]/text()')
        parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/a[@class="topictitle"]/@href')
    else:
        parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/dd[@class="posts"]/text()')
        parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/*/div/a[@class="topictitle"]/text()')
        parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/*/div/a[@class="topictitle"]/@href')

    if not parentPages["urls"] and "arutelud.com" in domain:
        rss_print.print_debug(__file__, "aktiivseid teemasid ei leitud, arutelude foorumis külastame mammutteemat", 1)
        parentPages["stamps"] = parsers_common.list_add_or_assign(parentPages["stamps"], 0, "")
        parentPages["titles"] = parsers_common.list_add_or_assign(parentPages["titles"], 0, "Arutelud")
        parentPages["urls"] = parsers_common.list_add_or_assign(parentPages["urls"], 0, "https://arutelud.com/viewtopic.php?f=3&t=4&sd=d&sk=t&st=7")

    # remove unwanted content: titles
    dictList = [
        "Race.Fi:",
        "Write my",
        "PÕLVAMAA, VÕRUMAA JA VALGAMAA CB JA HAM SIDE",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("&sid=")[0]
            curParentUrl = curParentUrl + "&start=100000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            articlePostsDict = {}
            if version in (1, 2):
                rss_print.print_debug(__file__, "kasutame spetsiifilist hankimist, domain = " + domain, 2)
                articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//tr/td/b[@class="postauthor"]/text()')
                articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//tr/td/div[@class="postbody"][1]', parent=True)
                articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//tr/td[@class="gensmall"]/div[@style="float: right;"]/text()')
                articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//tr/td[@class="gensmall"]/div[@style="float: right;"]/a/@href')
            else:
                rss_print.print_debug(__file__, "kasutame üldist hankimist, domain = " + domain, 3)
                articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]//strong//text()')
                articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//div[@class="content"]', parent=True)
                articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/time/@datetime')
                articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/a/@href')

                if not articlePostsDict["pubDates"]:
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/time/text()')
                if not articlePostsDict["pubDates"]:
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/text()[1]')
                    if len(articlePostsDict["pubDates"][0]) < 5:
                        rss_print.print_debug(__file__, "hangitud aeg[0] liiga lühike: '" + articlePostsDict["pubDates"][0] + "', proovime alternatiivi...", 0)
                        articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/text()[2]')
                        if len(articlePostsDict["pubDates"][0]) < 5:
                            rss_print.print_debug(__file__, "hangitud aeg[0] liiga lühike: '" + articlePostsDict["pubDates"][0] + "'", 0)
                        else:
                            rss_print.print_debug(__file__, "hangitud aeg[0]: '" + articlePostsDict["pubDates"][0] + "'", 4)
                if not articlePostsDict["pubDates"]:
                    rss_print.print_debug(__file__, "ei suutnud hankida ühtegi aega", 0)

            # go through the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.replace('</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(curArtDesc, '<div class="quotetitle">', "</div>", "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
                curArtPubDate = parsers_datetime.remove_weekday_strings(curArtPubDate)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "eile", "%d %m %Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "täna", "%d %m %Y", offsetDays=0)
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
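# --- Illustrative sketch only ---
# Approximates the normalisation chain used above (months_to_int +
# remove_weekday_strings) with the standard library. The Estonian month and
# weekday word lists below are assumptions for this example; the project's own
# tables live in parsers_datetime and may differ.
import re

_EE_MONTHS = {
    "jaanuar": "01", "veebruar": "02", "märts": "03", "aprill": "04",
    "mai": "05", "juuni": "06", "juuli": "07", "august": "08",
    "september": "09", "oktoober": "10", "november": "11", "detsember": "12",
}
_EE_WEEKDAYS = ("esmaspäev", "teisipäev", "kolmapäev", "neljapäev",
                "reede", "laupäev", "pühapäev")


def _normalize_estonian_date(text):
    # drop weekday words, map month names to numbers, collapse whitespace
    text = text.lower()
    for day in _EE_WEEKDAYS:
        text = text.replace(day + ",", "").replace(day, "")
    for name, num in _EE_MONTHS.items():
        text = text.replace(name, num)
    return re.sub(r"\s+", " ", text).strip()

# _normalize_estonian_date("Neljapäev, 28. jaanuar 2021, 16:15")
# -> "28. 01 2021, 16:15"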
def fill_article_dict(articleDataDict, pageTree, domain):
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 1)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//tbody/tr/th[@class="col-1"]/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//tbody/tr/th[@class="col-7 teemapealkiri"]/a/text()')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//tbody/tr/th[@class="col-4"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Lõvide perekonna uus teema",
        "abort",
        "beebi",
        "ivf",
        "lapse",
        "rase ",
        "rased",
        "triibupüüdjad",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("/#")[0]

            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//div[@class="bbp-reply-content entry-content"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="post_date date updated"]/text()')
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="bbp-reply-header entry-title"]/@id')

            # go through the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.split('<div class="gdrts-rating-block ')[0]
                curArtDesc = parsers_html.html_remove_single_parents(curArtDesc)
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(parentPages["urls"], i) + "/#" + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    # remove unwanted content: descriptions
    dictList = [
        " liba ",
        "Kommentaar eemaldatud.",
        "Liba?",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "descriptions")

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//table[@class="grid zebra forum"]/tr/td[@class="meta"][4]/span/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@title')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Börsihai",
        "Cleveroni aktsiate ost/müük/oksjon",
        "Head uut aastat – prognoosid",
        "Keegi malet soovib mängida",
        "LHV Pank paremaks",
        "Uurimis- ja lõputööde küsimustikud",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, parentPages["urls"][i] + '?listEventId=jumpToPage&listEventParam=100&pagesOfMaxSize=true', cache='cacheStamped', pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/p[@class="author"]/strong/a/text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-content temporary-class"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/node()')
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/@href')

            # go through the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, parsers_common.get(articlePostsDict["descriptions"], j))

                # pubDates magic from "15.01.2012 23:49" to datetime()
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "Eile", "%d.%m.%Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.add_missing_date_to_string(curArtPubDate, "%d.%m.%Y %H:%M", "%d.%m.%Y ")
                curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDate, "%d.%m.%Y %H:%M")
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parentPages["urls"][i] + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict