def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the list of all articles from the news site's forum listing.

    pageTree           -- lxml tree of the already-fetched listing page
    domain             -- site domain, used to absolutize relative URLs
    maxPageURLstoVisit -- cap on how many individual article pages are fetched

    Returns a dict of parallel lists (descriptions, ids, images, pub dates,
    titles, urls) describing the articles.
    """
    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath('//div[@class="forum"][2]/ul/li/a/text()')
    articleUrls = pageTree.xpath('//div[@class="forum"][2]/ul/li/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    get_article_bodies = True

    for i, articleUrl in enumerate(articleUrls):
        # get unique id from articleUrl: the value of its '&id=' parameter.
        # Fix: the original used str.index() to locate the terminating '&',
        # which raises ValueError when '&id=' is the last query parameter;
        # use find() and fall back to end-of-string instead.
        idStart = articleUrl.index('&id=') + 4
        idEnd = articleUrl.find('&', idStart)
        articleIds.append(
            articleUrl[idStart:] if idEnd == -1 else articleUrl[idStart:idEnd])

        if (get_article_bodies is True and i < maxPageURLstoVisit):
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # descriptions
            articleDescriptions.append(extractArticleBody(articleTree))

            # images
            curArtPubImage = parsers_common.treeExtract(
                articleTree,
                '//div[@id="content"]/div[@class="full_width"]/a/img[@class="thumb"]/@src'
            )
            articleImages.append(curArtPubImage)

            # timeformat magic from "13/12/2017 22:24:59" to datetime()
            curArtPubDate = parsers_common.treeExtract(
                articleTree,
                '//div[@id="content"]/div[@class="full_width"]/p[*]/i/b[2]/text()'
            )
            curArtPubDate = parsers_common.rawToDatetime(
                curArtPubDate, "%d/%m/%Y %H:%M:%S")
            articlePubDates.append(curArtPubDate)

    articleImages = parsers_common.domainUrls(domain, articleImages)

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the list of Tartu treasure entries from the listing page.

    Extracts all rows from the listing table, then keeps only those whose
    description mentions "Tartu". Returns a dict of parallel lists.
    """
    articleDescriptions = []
    articleIds = []
    # articleImages = []
    articlePubDates = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[1]/text()')
    articleTitles = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/text()')
    articleUrls = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    descParents = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr')  # as a parent

    for idx, curUrl in enumerate(articleUrls):
        # the final URL path component serves as the unique article id
        articleIds.append(curUrl.split('/')[-1])

        # description: stringified contents of the whole table row
        articleDescriptions.append(
            parsers_common.stringify_children(descParents[idx]))

        # timeformat magic from "12.12.2017" to datetime(), replaced in place
        articlePubDates[idx] = parsers_common.rawToDatetime(
            articlePubDates[idx], "%d.%m.%Y")

    # keep only entries whose description mentions the "Tartu" location
    retArticleDescriptions = []
    retArticleIds = []
    retArticleImages = []
    retArticlePubDates = []
    retArticleTitles = []
    retArticleUrls = []
    for idx in range(len(articleUrls)):
        if 'Tartu' in articleDescriptions[idx]:
            retArticleDescriptions.append(articleDescriptions[idx])
            retArticleIds.append(articleIds[idx])
            # retArticleImages.append(articleImages[idx])
            retArticlePubDates.append(articlePubDates[idx])
            retArticleTitles.append(articleTitles[idx])
            retArticleUrls.append(articleUrls[idx])

    return {
        "articleDescriptions": retArticleDescriptions,
        "articleIds": retArticleIds,
        "articleImages": retArticleImages,
        "articlePubDates": retArticlePubDates,
        "articleTitles": retArticleTitles,
        "articleUrls": retArticleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the list of all articles shown in the front-page news module.

    Descriptions, images, titles and URLs come straight from the listing
    page; per-article fetching (for publication times) is currently
    disabled because that extraction is broken.
    """
    articleDescriptions = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/p[1]/text()'
    )
    articleIds = []
    articleImages = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/a/img/@src')
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/h4/a/text()'
    )
    articleUrls = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/h4/a/@href'
    )
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    # todo(reading times from articles is BROKEN and maybe useless too)
    get_article_bodies = False

    for idx, curUrl in enumerate(articleUrls):
        # derive a stable unique id by hashing the article URL
        articleIds.append(parsers_common.urlToHash(curUrl))

        if get_article_bodies is True and idx < maxPageURLstoVisit:
            # fetch and parse the individual article page
            articleTree = makereq.getArticleData(curUrl)

            # timeformat magic from "Avaldatud: Neljapäev, 14 Detsember 2017 12:46" to datetime()
            rawPubDate = parsers_common.treeExtract(
                articleTree, '//span[@class="kakk-postdateicon"]//text()')
            rawPubDate = parsers_common.longMonthsToNumber(
                rawPubDate.split(',')[1])
            articlePubDates.append(
                parsers_common.rawToDatetime(rawPubDate, "%d %m %Y %H:%M"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the list of all campaign offers from the listing page.

    Everything is taken from the listing page itself; no per-offer pages
    are fetched, so images/titles/urls come from the overview markup.
    """
    articleDescriptions = []
    articleIds = []
    articleImages = pageTree.xpath(
        '//div[@class="product_camp_box w"]/a/div/div[@class="leftC"]/div/img/@src'
    )
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="product_camp_box w"]/a/div/div[@class="leftC"]/h3/text()'
    )
    articleUrls = pageTree.xpath(
        '//div[@class="product_camp_box w"]/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    descParents = pageTree.xpath(
        '//div[@class="product_camp_box w"]/a/div/div[@class="leftC"]'
    )  # as a parent

    for idx, curUrl in enumerate(articleUrls):
        # unique id: the final URL path component, without leading dashes
        articleIds.append(curUrl.split('/')[-1].lstrip('-'))

        # description: stringified contents of the offer's text container
        articleDescriptions.append(
            parsers_common.stringify_children(descParents[idx]))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the list of all articles from the news site.

    pageTree           -- lxml tree of the already-fetched listing page
    domain             -- site domain, used to absolutize relative URLs
    maxPageURLstoVisit -- cap on how many individual article pages are fetched

    For the first maxPageURLstoVisit articles the full article page is
    fetched to get the complete body and an exact publication datetime;
    for the rest, the date is assembled from the listing-page day/month/year
    fragments (only for years after 2016). Returns a dict of parallel lists.
    """
    articleDescriptions = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-excerpt"]/p/text()'
    )
    articleIds = []
    articleImages = pageTree.xpath('//div[@class="news-list-media"]/img/@src')
    articleImages = parsers_common.domainUrls(domain, articleImages)
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/h3/a/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/h3/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    articlePubDay = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/text()[1]'
    )
    articlePubMonth = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/span[@class="month"]/text()'
    )
    articlePubYear = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/text()[2]'
    )

    get_article_bodies = True

    for i in range(0, len(articleUrls)):
        articleUrl = articleUrls[i]

        # generate unique id from articleUrl
        articleIds.append(parsers_common.urlToHash(articleUrl))

        if (get_article_bodies is True and i < maxPageURLstoVisit):
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # descriptions: replace the listing excerpt with the full body
            curArtDescParent = parsers_common.treeExtract(
                articleTree,
                '//div[@class="news-single-item"]/div[@class="news-single-content"]'
            )  # as a parent
            curArtDescChilds = parsers_common.stringify_children(
                curArtDescParent)
            # Fix: the original assigned articleDescriptions[i] unconditionally,
            # which raises IndexError when the excerpt xpath returned fewer
            # entries than there are article URLs; append in that case.
            if i < len(articleDescriptions):
                articleDescriptions[i] = curArtDescChilds
            else:
                articleDescriptions.append(curArtDescChilds)

            # timeformat magic from "13 dets 17" to datetime()
            curArtPubDate = parsers_common.treeExtract(
                articleTree, '//div[@class="news-single-timedata"]/text()')
            curArtPubDate = parsers_common.shortMonthsToNumber(curArtPubDate)
            curArtPubDate = parsers_common.rawToDatetime(
                curArtPubDate, "%d %m %y")
            articlePubDates.append(curArtPubDate)
        else:
            # Fix: the original guarded only articlePubYear before indexing,
            # yet also indexed articlePubDay/articlePubMonth — guard all three
            # so a short xpath result can't raise IndexError.
            if (i < len(articlePubDay) and i < len(articlePubMonth)
                    and i < len(articlePubYear)
                    and int(articlePubYear[i].strip()) > 2016):
                curYear = articlePubYear[i].strip()
                curArtPubDate = articlePubDay[i].strip(
                ) + " " + articlePubMonth[i].strip() + " " + curYear
                curArtPubDate = parsers_common.rawToDatetime(
                    curArtPubDate, "%d %m %Y")
                articlePubDates.append(curArtPubDate)

    return {
        "articleDescriptions": articleDescriptions,
        "articleImages": articleImages,
        "articleIds": articleIds,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }