def parse_article(self, response):
    """Parse a Zing News article page and chain a comment-API request.

    Collects JSON-LD metadata, <meta> tag values, body text, image and
    video URLs into ``article``, then yields a request to the comment API
    with the article carried in ``response.meta``.
    """
    article = {}
    # JSON-LD block of type NewsArticle; best-effort — skip on parse errors.
    ld_json = response.xpath(
        '//script[contains(text(),"NewsArticle")]/text()').get()
    if ld_json:
        try:
            ld_json_dict = json.loads(ld_json)
            # NOTE: `time` is a project helper module, not stdlib `time`.
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        except (ValueError, TypeError, KeyError):
            # Malformed/unexpected JSON-LD: keep scraping the rest.
            pass
    # Meta elements.
    elems = {
        'meta-description': response.xpath("//meta[@name='description']/@content").get(),
        'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'meta-title': response.xpath("//meta[@name='title']/@content").get(),
        'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'meta-author': response.xpath("//meta[@name='author']/@content").get(),
        'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
        'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
        # BUG FIX: previously read geo.region for the geo.position key.
        'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
        'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
        'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
        'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
        'category': response.xpath('//p[@class = "the-article-category"]/a/text()').get(),
        'organization': 'zing',
        'related_urls': response.xpath('//div[@class = "article-list layout-grid-3"]//article/p/a/@href').getall(),
        'url': response.url,
    }
    article.update(elems)
    article.update(response.meta['viral'])
    # Body text: join stripped paragraphs (avoids quadratic `+=` concat).
    content = ''.join(
        text.strip()
        for text in response.xpath(
            '//*[@id="page-article"]/div[@class="page-wrapper"]'
            '/descendant::div[@class = "the-article-body"]/p/text()').getall())
    article.update({'content': content, 'word_count': len(content.split())})
    # Image URLs keyed image1..imageN.
    images = {
        'image' + str(index): src
        for index, src in enumerate(
            response.xpath(
                '//*[@id="page-article"]/div[@class="page-wrapper"]'
                '/descendant::table[@class = "picture"]//img/@src').getall(), 1)
    }
    article.update({'image-urls': images})
    # Video URLs keyed video1..videoN ('video urls' key kept for compatibility).
    videos = {
        'video' + str(index): src
        for index, src in enumerate(
            response.xpath('//figure[@class="video cms-video"]/@data-video-src').getall(), 1)
    }
    article.update({'video urls': videos})
    # Chain to the comment API.  BUG FIX: a missing article id previously
    # raised TypeError on string concatenation; now the request is skipped.
    article_id = response.xpath('//@article-id').get()
    if article_id:
        cmt_request = "https://api.news.zing.vn/api/comment.aspx?action=get&id=" + article_id
        yield scrapy.Request(cmt_request, callback=self.parse_comments,
                             meta={'article': article})
def parse_article(self, response):
    """Parse a VnExpress article page and chain a Facebook like-count request.

    Extracts JSON-LD / meta timestamps, meta tags, author, body text,
    thumbnails and related links, then yields a request to the Facebook
    like plugin carrying the article and comment ids in ``meta``.
    """
    article = dict()
    title = response.xpath(
        '(//h1[@class="title_news_detail mb10"]/text())|(//h1[@class="title"]/text())').get()
    if title is None:
        # Not an article page — nothing to scrape.
        return
    # JSON-LD (best effort; `time` is a project helper module).
    ld_json = response.xpath('//script[contains(text(),"NewsArticle")]/text()').get()
    if ld_json:
        try:
            article.update(time.timestamp_converter(json.loads(ld_json)))
        except (ValueError, TypeError, KeyError):
            pass
    # Fallbacks when JSON-LD lacked the timestamps.
    if 'datePublished' not in article:
        date_published = response.xpath('(//meta[@name="pubdate"]/@content)').get()
        if date_published is not None:
            article['datePublished'] = time.Vnex_timestamp(date_published.strip())
        else:
            article['datePublished'] = response.xpath(
                '//meta[@name="its_publication"]/@content').get()
    if 'dateModified' not in article:
        date_modified = response.xpath('(//meta[@itemprop="dateModified"]/@content)').get()
        if date_modified is not None:
            article['dateModified'] = time.Vnex_timestamp(date_modified.strip())
        else:
            article['dateModified'] = response.xpath(
                '//meta[@name="article_updatetime"]/@content').get()
    article.update({'link': response.url, 'title': title})
    # Meta tags.  NOTE: 'category' is set twice on purpose — the breadcrumb
    # value below overrides article:section when present (legacy behaviour).
    article.update({'type': response.xpath("//head/meta[@property='og:type']/@content").get()})
    article.update({'description': response.xpath("//head/meta[@name='description']/@content").get()})
    article.update({'keywords': response.xpath("//head/meta[@name='keywords']/@content").get()})
    article.update({'category': response.xpath("//head/meta[@property='article:section']/@content").get()})
    article.update({'copyright': response.xpath("//head/meta[@name='copyright']/@content").get()})
    article.update({'language': response.xpath("//head/meta[@name='Language']/@content").get()})
    article.update({'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get()})
    article.update({'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get()})
    article.update({'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get()})
    article.update({'category': response.xpath(
        "(//li[@class='start']/h4/a/text())|(//li[@class='start have_cap2 ']/h4/a/text())").get()})
    article.update({'organization': 'Vnexpress'})
    # Author and body text (joined, avoiding quadratic `+=`).
    author = ''.join(text.strip() for text in response.xpath(
        '(//section[@class="container"]/section[@class="wrap_sidebar_12"]/section['
        '@class="sidebar_1"]/article[@class="content_detail fck_detail width_common '
        'block_ads_connect"]/p[@class="Normal"]/strong/text())|(//p['
        '@class="author_mail"]/strong/text())|(//p['
        '@style="text-align:right;"]/strong/text())').getall())
    article.update({'author': author})
    content = ''.join(text.strip() for text in response.xpath(
        '(//article[@class="content_detail fck_detail width_common '
        'block_ads_connect"]/p/text())|(//div[@class="desc_cation"]/p/text())|(//div['
        '@class="desc_cation"]/p/strong/text())|(//div[contains(@class,'
        '"box_tableinsert") or contains(@class,"box_quangcao") or contains(@class,'
        '"box_brief_info")]//p//text())|(//div[@class="WordSection1"]/p/text())|(//td/p[@class="Image"]/text())').getall())
    article.update({'content_article': content})
    # `content` is always a str, so the old `is not None` branch was dead.
    article.update({'word_count': len(content.split())})
    # Thumbnails — getall() returns a (possibly empty) list, never None.
    article.update({'thumbnail': response.xpath(
        '(//td/img/@src)|(//div[@class="item_slide_show clearfix"]/div/img/@src)').getall()})
    # Related articles.  BUG FIX: '/@title' / '/@href' selected from the
    # document root, never from the <a> tag — use relative './@...' instead.
    relate_urls = []
    for tag in response.xpath(
            '//ul[@class="list_title"]/li/a[@data-event-action="article_box_related"]'):
        relate_urls.append({
            'headline': tag.xpath('./@title').get(),
            'url': "https://vnexpress.vn" + str(tag.xpath('./@href').extract_first()),
        })
    article.update({"related_url": relate_urls})
    # Comment-widget ids: abort (yield nothing) if any are missing.
    objectid = response.xpath('//head/meta[@name="tt_article_id"]/@content').get()
    if objectid is None:
        return
    siteid = response.xpath('//head/meta[@name="tt_site_id"]/@content').get()
    if siteid is None:
        return
    categoryid = response.xpath('//head/meta[@name="tt_category_id"]/@content').get()
    if categoryid is None:
        return
    id_article = {'objectid': objectid, 'siteid': siteid, 'categoryid': categoryid}
    url_like = response.xpath('//meta[@name="its_url"]/@content').get()
    if url_like is not None:
        # Total likes via the Facebook like plugin.
        like_request = ("https://www.facebook.com/plugins/like.php?href="
                        + url_like + "&layout=button_count")
        yield scrapy.Request(like_request, callback=self.parse_like,
                             meta={'article': article, 'id_article': id_article})
def parse_article(self, response):
    """Parse a Dân trí article page and chain a Facebook like-count request.

    Gathers JSON-LD metadata, meta tags, body text, images, hashtags and
    videos, then yields a request for the article's Facebook like count.
    """
    article = {}
    # JSON-LD (best effort; `time` is a project helper module).
    ld_json = response.xpath("//script[contains(text(),'NewsArticle')]/text()").get()
    if ld_json:
        try:
            article.update(time.timestamp_converter(json.loads(ld_json)))
        except (ValueError, TypeError, KeyError):
            pass
    # Meta elements.
    elems = {
        'meta-description': response.xpath("//meta[@name='description']/@content").get(),
        'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'meta-title': response.xpath("//meta[@name='title']/@content").get(),
        'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'meta-author': response.xpath("//meta[@name='author']/@content").get(),
        'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
        'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
        # BUG FIX: previously read geo.region for the geo.position key.
        'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
        'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
        'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
        'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
        'category': response.xpath(
            '//a[@class = "breadcrumbitem1"][contains(@href, "htm")]/span/text()').get(),
        'organization': 'dân trí',
        'url': response.url,
        'related_urls': response.xpath(
            '//div[@class = "article-oldnew"]//div/div[@class = "article-oldnew-img"]/a/@href').getall(),
    }
    article.update(elems)
    # Body text from the three known content containers.
    content_xpaths = (
        '//*[@id="divNewsContent"]/p/text()',
        '//*[@class = "detail-content"]/p/text()',
        '//div[@class="e-body"]//p/text()',
    )
    content = ''.join(
        text.strip()
        for xp in content_xpaths
        for text in response.xpath(xp).getall())
    article.update({'content': content, 'word_count': len(content.split())})
    # Image URLs keyed image1..imageN with ONE continuous counter.
    # BUG FIX: the old code seeded the third loop with index2 + 1, and when
    # the second source was empty index2 was still 0, so numbering restarted
    # at image1 and overwrote earlier entries.
    image_xpaths = (
        '//*[@id="divNewsContent"]//img/@src',
        '//*[@class = "detail-content"]//img/@src',
        '//div[@class="e-body"]//figure[contains(@class,"image")]//@src',
    )
    images = {}
    counter = 0
    for xp in image_xpaths:
        for src in response.xpath(xp).getall():
            counter += 1
            images['image' + str(counter)] = src
    article.update({'image-urls': images})
    # Hashtags keyed tag1..tagN.
    hashtags = {
        'tag' + str(index): href
        for index, href in enumerate(
            response.xpath('//span[@class = "news-tags-item"]/a/@href').getall(), 1)
    }
    article.update({'hash-tags': hashtags})
    # Video URLs — merged straight into `article` (legacy behaviour kept).
    videos = {
        'video' + str(index): "vcdn.dantri.com.vn/" + src
        for index, src in enumerate(
            response.xpath('//div[@class="e-body"]/figure[@class = "video"]//@data-src').getall(), 1)
    }
    article.update(videos)
    # Build the FB like-plugin URL from whichever article id is available.
    news_id = response.xpath('//*[@id="hdNewsId"]/@value').get()
    if news_id is not None:
        like_request = "https://www.facebook.com/v2.3/plugins/like.php?action=like&app_id=164035690775918&channel=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df31c1be4fdc1a28%26domain%3Ddantri.com.vn%26origin%3Dhttps%253A%252F%252Fdantri.com.vn%252Ff3a046e102e74f4%26relation%3Dparent.parent&container_width=0&href=https%3A%2F%2Fdantri.com.vn%2Fnews-" + \
            news_id + ".htm&layout=button_count&locale=vi_VN&sdk=joey&share=false&show_faces=false&size=small"
    else:
        news_id = response.xpath('//*[@id="hidDistID"]/@value').get()
        if news_id is not None:
            like_request = "https://www.facebook.com/plugins/like.php?href=" + response.url + \
                "&send=false&share=true&layout=standard&width=450&show_faces=false&action=like&colorscheme=light&font&height=35&"
        else:
            # Last resort: slice the numeric id out of the URL (.../slug-<id>.htm).
            pv1 = response.url.find('.htm')
            pv2 = response.url.find('-', pv1 - 20) + 1
            news_id = response.url[pv2:pv1]
            like_request = "https://www.facebook.com/v2.3/plugins/like.php?action=like&app_id=164035690775918&channel=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df322cc0314d7894%26domain%3Ddantri.com.vn%26origin%3Dhttps%253A%252F%252Fdantri.com.vn%252Ffe7c5846d65f58%26relation%3Dparent.parent&container_width=0&href=https%3A%2F%2Fdantri.com.vn%2Fnews-" + \
                news_id + ".htm&layout=button_count&locale=vi_VN&sdk=joey&share=false&show_faces=false"
    yield scrapy.Request(like_request, callback=self.parse_likes,
                         meta={'article': article, 'id': news_id})
def parse_article(self, response):
    """Parse a Soha article page and chain a share-count API request.

    For 'normal' articles, reads JSON-LD and breadcrumb category; always
    collects meta tags, body text and images, then yields a request to the
    sharefb.cnnd.vn interaction-count API with the article in ``meta``.
    """
    atc_type = response.meta['atc_type']
    article = {}
    if atc_type == 'normal':
        # JSON-LD.  BUG FIX: json.loads(None) raised TypeError and killed
        # the whole parse when the script tag was absent — now guarded.
        ld_json = response.xpath(
            '//*[@id="Head1"]//script[contains(text(),"NewsArticle")]/text()').get()
        if ld_json:
            ld_json_dict = json.loads(ld_json)
            # NOTE: `time` is a project helper module, not stdlib `time`.
            ld_json_dict = time.timestamp_converter(ld_json_dict)
            article.update(ld_json_dict)
        # Category from the BreadcrumbList JSON-LD (best effort).
        try:
            cate_json = response.xpath(
                '//script[contains(text(), "BreadcrumbList")]/text()').get().strip()
            cate_json = json.loads(cate_json)
            category = cate_json.get('itemListElement')[1].get('item').get('name')
            article.update({'category': category})
        except (AttributeError, ValueError, TypeError, IndexError):
            pass
    # Meta elements.
    elems = {
        'meta-description': response.xpath("//meta[@name='description']/@content").get(),
        'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'meta-title': response.xpath("//meta[@name='title']/@content").get(),
        'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'meta-author': response.xpath("//meta[@name='author']/@content").get(),
        'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
        'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
        # BUG FIX: previously read geo.region for the geo.position key.
        'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
        'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
        'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
        'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
        'organization': 'soha',
        'url': response.url,
    }
    article.update(elems)
    # Body text (joined un-stripped, matching the original behaviour).
    content = ''.join(response.xpath(
        '//div[@class = "clearfix news-content"]/p/text()').getall())
    article.update({'content': content, 'word_count': len(content.split())})
    # Image URLs keyed image1..imageN.
    images = {
        'image' + str(index): src
        for index, src in enumerate(
            response.xpath(
                '//div[@class = "clearfix news-content"]/div[@type = "Photo"]//@src').getall(), 1)
    }
    article.update({'image-urls': images})
    # Likes/comments via the shared interaction-count API.
    yield scrapy.Request(
        "https://sharefb.cnnd.vn/?urls=" + response.url,
        callback=self.parse_interations,
        headers={
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Origin': 'https://soha.vn',
            'Referer': response.url,
            'Sec-Fetch-Mode': 'cors',
            'User-Agent':
            'Mozilla/5.0 (Windows 10 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        },
        meta={'article': article, 'atc_type': atc_type})
def parse_item(self, response):
    """Parse a Báo mới article page and yield the scraped item.

    Extracts JSON-LD, meta tags, author, body text, thumbnails and related
    links; yields the assembled ``article`` dict directly (no follow-up
    request for this site).
    """
    article = dict()
    title = response.xpath(
        '//div[@class="article"]/h1[@class="article__header"]/text()').extract_first()
    if title is None:
        # Not an article page — nothing to scrape.
        return
    # JSON-LD (best effort; `time` is a project helper module).
    ld_json = response.xpath("//script[@type='application/ld+json'][1]/text()").get()
    if ld_json:
        try:
            article.update(time.timestamp_converter(json.loads(ld_json)))
        except (ValueError, TypeError, KeyError):
            pass
    # Meta tags (the 'Language' key's capitalisation is legacy — kept).
    article.update({'type': response.xpath("//head/meta[@property='og:type']/@content").get()})
    article.update({'description': response.xpath("//head/meta[@name='description']/@content").get()})
    article.update({'keywords': response.xpath("//head/meta[@name='keywords']/@content").get()})
    article.update({'category': response.xpath("//head/meta[@property='article:section']/@content").get()})
    article.update({'copyright': response.xpath("//head/meta[@name='copyright']/@content").get()})
    article.update({'Language': response.xpath("//head/meta[@name='Language']/@content").get()})
    article.update({'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get()})
    article.update({'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get()})
    article.update({'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get()})
    article.update({'organization': 'Báo mới'})
    article.update({'title': title, 'link': response.url})
    # Author and body text (joined, avoiding quadratic `+=`).
    author = ''.join(text.strip() for text in response.xpath(
        '(//div[@id="ArticleContent"]/p[@class="t-j"]/span/text())|(//div[@class="article__body"]/p['
        '@class="body-text body-author"]/strong/text())|(//p[@class="body-text body-author"]/strong/text())'
    ).getall())
    article.update({'author': author})
    content = ''.join(text.strip() for text in response.xpath(
        '(//div[@id="ArticleContent"]/p[@class="t-j"]/text())|(//div[@class="article__body"]/p['
        '@class="body-text"]/text())|(//div[@class="article__sapo"]/text())').getall())
    article.update({'content_article': content})
    article.update({'word_count': len(content.split())})
    # Thumbnails.
    article.update({'thumbnail': response.xpath('//p[@class="body-image"]/img/@src').getall()})
    # Related articles.
    relate_url = []
    for tag in response.xpath('//div[@data-track="detail|related"]/div/h4'):
        relate_url.append({
            'headline': tag.xpath('a/@title').get(),
            # NOTE: str() turns a missing href into 'None' — legacy behaviour kept.
            'url': str(tag.xpath('a/@href').extract_first()),
        })
    article.update({"related_url": relate_url})
    self.logger.info("#%d: Scraping %s", self.articleCount, article.get('link'))
    self.articleCount += 1
    yield article
def parse_item(self, response):
    """Parse a Tuổi trẻ article page and chain a like-count API request.

    Extracts timestamps, meta tags, author, body text, images and related
    links, then yields a request to the count-object API with the article
    and comment-widget ids carried in ``meta``.
    """
    article = dict()
    title = response.xpath('//head/meta[@property="og:title"]/@content').extract_first()
    if title is None:
        # Not an article page — nothing to scrape.
        return
    # Publish / modify timestamps from the og/article meta tags.
    date = {
        'datePublished': response.xpath(
            '//meta[@property="article:published_time"]/@content').get(),
        'dateModified': response.xpath(
            '//meta[@property="article:modified_time"]/@content').get(),
    }
    # Best-effort conversion (`time` is a project helper module); the old
    # `if date is not None` guard was dead — a dict literal is never None.
    try:
        article.update(time.timestamp_converter(date))
    except (ValueError, TypeError, KeyError):
        pass
    article.update({'title': title, 'link': response.url})
    # Meta tags.
    article.update({'headline': response.xpath('//meta[@itemprop="headline"]/@content').get()})
    article.update({'type': response.xpath("//meta[@property='og:type']/@content").get()})
    article.update({'description': response.xpath("//meta[@name='description']/@content").get()})
    article.update({'keywords': response.xpath("//meta[@name='keywords']/@content").get()})
    article.update({'category': response.xpath("//meta[@property='article:section']/@content").get()})
    article.update({'copyright': response.xpath("//meta[@name='copyright']/@content").get()})
    article.update({'language': response.xpath("//meta[@name='Language']/@content").get()})
    article.update({'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get()})
    article.update({'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get()})
    article.update({'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get()})
    article.update({'organization': 'Tuổi trẻ'})
    # Author and body text (joined, avoiding quadratic `+=`).
    author = ''.join(text.strip() for text in response.xpath(
        '(//div|//p)[contains(@class, "author") or contains(@class, "author_single") or contains(@class,"authorvideo") or contains(@class,"credit-text")]//text()'
    ).getall())
    article.update({'author': author})
    content = ''.join(text.strip() for text in response.xpath(
        '//div[contains(@id,"main-detail-body") or contains(@class,"sp-detail-content") or contains(@class,"fck")]/p//text()'
    ).getall())
    article.update({'content_article': content})
    article.update({'word_count': len(content.split())})
    # Thumbnails.
    article.update({'thumbnail': response.xpath(
        '(//div[@type="Photo"]/div/a/img/@src)|(//div[@type="Photo"]/div/img/@src)|(//td/a/img/@src)'
    ).getall()})
    # Open-Graph image descriptor.
    image = {
        'url': response.xpath('//meta[@property="og:image"]/@content').get(),
        'alt': response.xpath('//meta[@property="og:image:alt"]/@content').get(),
        'width': response.xpath('//meta[@property="og:image:width"]/@content').get(),
        'height': response.xpath('//meta[@property="og:image:height"]/@content').get(),
    }
    article.update({'image': [image]})
    # Related articles.
    relate_url = []
    for tag in response.xpath('//ul[@class="list-news"]/li/div[@class="name-title"]'):
        relate_url.append({
            'headline': tag.xpath('a/text()').get(),
            'url': "https://tuoitre.vn" + str(tag.xpath('a/@href').extract_first()),
        })
    article.update({"related_url": relate_url})
    # Comment-widget attributes: abort (yield nothing) if any are missing.
    section = '//div[@id="tagandnetwork"]/div[@class="tagandtopicandbanner"]/section/'
    objectid = response.xpath(section + '@data-objectid').get()
    if objectid is None:
        return
    datasort = response.xpath(section + '@data-sort').get()
    if datasort is None:
        return
    pagesize = response.xpath(section + '@data-pagesize').get()
    if pagesize is None:
        return
    objecttype = response.xpath(section + '@data-objecttype').get()
    if objecttype is None:
        return
    id_article = {
        'objectid': objectid,
        'datasort': datasort,
        'pagesize': pagesize,
        'objecttype': objecttype,
    }
    # Total likes via the count-object endpoint.
    total_like = "https://s1.tuoitre.vn/count-object.htm?newsId=" + objectid
    yield scrapy.Request(total_like,
                         callback=self.parse_like,
                         headers={
                             'Accept': '*/*',
                             'Origin': 'https://tuoitre.vn',
                             'Referer': response.url,
                             'Sec-Fetch-Mode': 'cors',
                         },
                         meta={'article': article, 'id_article': id_article})
def parse_article(self, response):
    """Parse a Kênh 14 article page and chain an interaction-count request.

    Extracts JSON-LD, meta tags, body text, images, videos and hashtags,
    plus the comment-widget parameters embedded in an inline script, then
    yields a request to the sharefb.cnnd.vn API.
    """
    article = {}
    # JSON-LD (best effort; `time` is a project helper module).
    ld_json = response.xpath("//script[contains(text(),'NewsArticle')]/text()").get()
    if ld_json:
        try:
            article.update(time.timestamp_converter(json.loads(ld_json)))
        except (ValueError, TypeError, KeyError):
            pass
    # Meta elements.
    elems = {
        'meta-description': response.xpath("//meta[@name='description']/@content").get(),
        'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'meta-title': response.xpath("//meta[@name='title']/@content").get(),
        'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'meta-author': response.xpath("//meta[@name='author']/@content").get(),
        'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
        'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
        # BUG FIX: previously read geo.region for the geo.position key.
        'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
        'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
        'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
        'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
        'category': response.xpath('//li[@class = "kmli active"]/a/text()').get(),
        'organization': 'kênh 14',
        'related_urls': response.xpath(
            '//div[@class = "kds-same-category clearfix"]//div[@class = "rowccm"]/li/a/@href').getall(),
        'url': response.url,
    }
    article.update(elems)
    # Body text (joined, avoiding quadratic `+=`).
    content = ''.join(
        text.strip()
        for text in response.xpath('//div[@class = "knc-content"]//p//text()').getall())
    article.update({'content': content, 'word_count': len(content.split())})
    # Image URLs keyed image1..imageN.
    images = {
        'image' + str(index): src
        for index, src in enumerate(
            response.xpath('//div[@class = "knc-content"]//div[@type = "Photo"]//@src').getall(), 1)
    }
    article.update({'image-urls': images})
    # Video URLs keyed video1..videoN.
    videos = {
        'video' + str(index): src
        for index, src in enumerate(
            response.xpath('//div[@type="VideoStream"]/@data-src').getall(), 1)
    }
    article.update({'video-urls': videos})
    # Hashtags keyed tag1..tagN.
    hashtags = {
        'tag' + str(index): href
        for index, href in enumerate(
            response.xpath('//ul[@class="knt-list"]/li//@href').getall(), 1)
    }
    article.update({'hash-tags': hashtags})
    # Comment-widget parameters live in an inline script as a call to
    # MINGID_IFRAME_FUNC.mingidGenIfram(...); slice out the argument tuple.
    # BUG FIX: a missing script previously raised AttributeError on .find().
    para_list = []
    comments_paras = response.xpath(
        '//script[@type="text/javascript"][contains(text(),"comment")]/text()').get()
    if comments_paras is not None:
        pv0 = comments_paras.find("MINGID_IFRAME_FUNC.mingidGenIfram")
        if pv0 != -1:
            pv1 = comments_paras.find("(", pv0)
            pv2 = comments_paras.find(")", pv1) + 1
            paras = comments_paras[pv1:pv2]
            # literal_eval is safe here: parses only Python literals.
            para_list = list(ast.literal_eval(paras))
    # Interaction counts via the shared API; article and comment params
    # travel in meta.
    inter_request = "https://sharefb.cnnd.vn/?urls=" + response.url
    yield scrapy.Request(inter_request,
                         callback=self.get_inter,
                         meta={'article': article, 'paras': para_list},
                         headers={
                             'Accept': 'application/json, text/javascript, */*; q=0.01',
                             'Origin': 'https://soha.vn',
                             'Referer': 'https://soha.vn/chiu-suc-ep-khong-lo-tu-my-tq-ngam-ngui-buong-tay-bo-roi-du-an-dau-mo-5-ti-usd-voi-doi-tac-lau-nam-20191007161429421.htm',
                             'Sec-Fetch-Mode': 'cors',
                             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
                         })
def parse_article(self, response):
    """Parse a TechTalk article page and chain a Facebook like-count request.

    Extracts JSON-LD, meta tags, breadcrumb category, body text, images,
    videos, hashtags and the on-page view counter, then yields a request to
    the Facebook like plugin with the article carried in ``meta``.
    """
    article = {}
    # JSON-LD (best effort; `time` is a project helper module).
    ld_json = response.xpath('//script[contains(text(),"Article")]/text()').get()
    if ld_json:
        try:
            article.update(time.timestamp_converter(json.loads(ld_json)))
        except (ValueError, TypeError, KeyError):
            pass
    # Meta elements.
    elems = {
        'meta-description': response.xpath("//meta[@name='description']/@content").get(),
        'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'meta-title': response.xpath("//meta[@name='title']/@content").get(),
        'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'meta-author': response.xpath("//meta[@name='author']/@content").get(),
        'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
        'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
        # BUG FIX: previously read geo.region for the geo.position key.
        'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
        'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
        'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
        'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
        'organization': 'techtalk',
        'url': response.url,
    }
    article.update(elems)
    # Category = second breadcrumb crumb; missing crumbs raise IndexError.
    try:
        article.update({'category': response.xpath(
            '//a[@class = "entry-crumb"]')[1].xpath('./span/text()').get()})
    except IndexError:
        pass
    # Body text (joined, avoiding quadratic `+=`).
    content = ''.join(
        text.strip()
        for text in response.xpath('//div[@class = "td-post-content"]//p/text()').getall())
    article.update({'content': content, 'word_count': len(content.split())})
    # Image URLs keyed image1..imageN.
    images = {
        'image' + str(index): src
        for index, src in enumerate(
            response.xpath(
                '//div[@class="td-post-content"]//*[contains(@class,"image") or contains(@class,"Image")]//@src').getall(), 1)
    }
    article.update({'image-urls': images})
    # Video URLs keyed video1..videoN ('video urls' key kept for compatibility).
    videos = {
        'video' + str(index): src
        for index, src in enumerate(
            response.xpath('//div[@class="td-post-content"]//iframe/@src').getall(), 1)
    }
    article.update({'video urls': videos})
    # Hashtags keyed tag1..tagN.
    hashtags = {
        'tag' + str(index): href
        for index, href in enumerate(
            response.xpath('//ul[@class = "td-tags td-post-small-box clearfix"]//@href').getall(), 1)
    }
    article.update({'hash-tags': hashtags})
    # On-page view counter.
    article.update({'views': response.xpath('//div[@class="td-post-views"]//text()').get()})
    # Total likes via the Facebook like plugin.
    like_request = "https://www.facebook.com/plugins/like.php?href=" + response.url + \
        "&layout=button_count&show_faces=false&width=105&action=like&colorscheme=light&height=21"
    yield scrapy.Request(like_request, callback=self.parse_likes,
                         meta={'article': article, 'url': response.url})
def parse_item(self, response):
    """Parse an Afamily article page and chain an interaction-count request.

    Extracts JSON-LD, meta tags, body text, the Open-Graph image descriptor
    and thumbnails, then yields a request to the sharefb.cnnd.vn API with
    the article carried in ``meta``.
    """
    article = dict()
    # JSON-LD: required for this site — abort (yield nothing) if absent or
    # unparseable.  `time` is a project helper module, not stdlib `time`.
    ld_json = response.xpath('//script[contains(text(),"NewsArticle")]/text()').get()
    if ld_json is None:
        return
    try:
        article.update(time.timestamp_converter(json.loads(ld_json)))
    except ValueError:
        return
    title = response.xpath('//meta[@property="og:title"]/@content').get()
    article.update({'title': title, 'link': response.url})
    # Meta tags.
    article.update({'type': response.xpath("//head/meta[@property='og:type']/@content").get()})
    article.update({'description': response.xpath("//head/meta[@name='description']/@content").get()})
    article.update({'keywords': response.xpath("//meta[@name='keywords']/@content").get()})
    article.update({'category': response.xpath("//meta[@property='article:section']/@content").get()})
    article.update({'copyright': response.xpath("//meta[@name='copyright']/@content").get()})
    article.update({'language': response.xpath("//meta[@name='Language']/@content").get()})
    article.update({'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get()})
    article.update({'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get()})
    article.update({'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get()})
    article.update({'organization': 'Afamily'})
    # NOTE(review): this overwrites the og:title value above with a list of
    # on-page <h1> texts — legacy behaviour kept; confirm which is wanted.
    title = response.xpath('//div[@class="w700 mr-40 fl"]/h1/text()').getall()
    article.update({'title': title})
    # Body text (joined, avoiding quadratic `+=`).
    content = ''.join(text.strip() for text in response.xpath(
        '(//div[@id="af-detail-content"]/p/text())|(//div[@data-role="content"]/div/span/text())|(//p['
        '@class="MsoNormal"]/text())|(//*[@id="af-detail-content"]/div/div/div/text())|(//*['
        '@id="af-detail-content"]/div/div/div/span/text())|(//*[@id="af-detail-content"]/div/div/p/text())'
    ).getall())
    article.update({'content_article': content})
    # `content` is always a str, so the old `is not None` branch was dead.
    article.update({'word_count': len(content.split())})
    # Open-Graph image descriptor (only when an og:image exists).
    images = []
    url_image = response.xpath('//meta[@property="og:image"]/@content').get()
    if url_image is not None:
        images.append({
            'url': url_image,
            'alt': response.xpath('//meta[@property="og:image:alt"]/@content').get(),
            'width': response.xpath('//meta[@property="og:image:width"]/@content').get(),
            'height': response.xpath('//meta[@property="og:image:height"]/@content').get(),
        })
    article.update({'image': images})
    # Thumbnails.
    thumbnail = response.xpath(
        '(//div[@class="VCSortableInPreviewMode LayoutAlbumWrapper alignJustify noCaption"]/div/div/div/figure/a/@href)|(//div[@type="Photo"]/div/a/img/@src)|(//figure[@type="Photo"]/div/a/img/@src)|(//a[@class="detail-img-lightbox"]/img/@src)'
    ).getall()
    article.update({'thumbnail': thumbnail})
    # BUG FIX: removed debug leftover that rewrote ./body.html with every
    # article's raw response body.
    # Likes/comments via the shared interaction-count API.
    yield scrapy.Request(
        'http://sharefb.cnnd.vn/?urls=' + response.url,
        callback=self.parse_interations,
        headers={
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Origin': 'https://afamily.vn',
            'Sec-Fetch-Mode': 'cors',
            'Referer': article.get('link')
        },
        meta={'article': article})
def parse_article(self, response):
    """Scrape a Thanh Niên article.

    Collects JSON-LD, meta tags, embedded video URLs, body text, images and
    the on-page comment tree, then yields a facebook like-count request
    carrying the ``article`` dict in its meta.
    """
    article = {}
    try:
        ld_json = response.xpath(
            "//script[contains(text(),'NewsArticle')]/text()").get()
        # NOTE(review): assumes the JSON-LD is served as a one-element
        # array; a plain object here raises and the data is silently
        # dropped below -- TODO confirm against a live page.
        ld_json_dict = json.loads(ld_json)[0]
        ld_json_dict = time.timestamp_converter(ld_json_dict)
        article.update(ld_json_dict)
    except Exception:  # was a bare except; keep best-effort semantics
        pass
    # meta tags
    elems = {
        'meta-description': response.xpath("//meta[@name='description']/@content").get(),
        'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'meta-title': response.xpath("//meta[@name='title']/@content").get(),
        'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'meta-author': response.xpath("//meta[@name='author']/@content").get(),
        'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
        'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
        # fixed copy-paste bug: this previously read the geo.region meta tag
        'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
        'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
        'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
        'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
        'category': response.xpath('//h2[@class = "headline"]/a/text()').get(),
        'organization': 'thanh niên',
        'url': response.url,
    }
    article.update(elems)
    # video urls: one embedded in an inline script (sliced out between the
    # quotes that follow "src"), one in a data attribute
    videos = []
    try:
        url_finder = response.xpath(
            '//figure[@itemprop = "associatedMedia"]/script/text()').get()
        pv1 = url_finder.find("src")
        pv2 = url_finder[pv1:].find('"') + pv1 + 1
        pv3 = url_finder[pv2:].find('"') + pv2
        videos.append(url_finder[pv2:pv3])
    except Exception:  # was a bare except; keep best-effort semantics
        pass
    video_url = response.xpath('//table[@class="video"]//@data-video-src').get()
    if video_url is not None:  # fixed: no longer appends a bare None
        videos.append(video_url)
    article.update({'videos-url': videos})
    # body text
    content = ''
    for text in response.xpath(
            '//div[@id="abody"]//p[contains(@style,"margin")or contains(@style,"text")]/text()'
    ).getall():
        content += text.strip()
    for text in response.xpath('//*[@id="abody"]//div/text()').getall():
        content += text.strip()
    article.update({'content': content})
    article.update({'word_count': len(content.split())})
    # image urls: cover image(s) first, then full-width images
    images = {}
    ava_index = 0
    for ava_index, src in enumerate(
            response.xpath('//*[@id="contentAvatar"]//a/img/@src').getall(), 1):
        images.update({'image' + str(ava_index): src})
    for index, src in enumerate(
            response.xpath('//*[@class="imagefull"]//@data-src').getall(),
            ava_index + 1):
        images.update({'image' + str(index): src})
    article.update({'image-urls': images})

    def _comment_fields(node):
        """Pull avatar/user/geo/content/created/likes out of one comment node
        (shared by primary comments and their replies)."""
        ava = node.xpath('.//div[@class = "ava"]/img/@data-src').get()
        user = node.xpath('.//div[@class = "data"]/div[@class = "meta"]/h4/text()').get()
        if user is not None:
            user = user.strip()
        geo = node.xpath('.//div[@class = "data"]/div[@class = "meta"]/time/text()').get()
        if geo is not None:
            geo = geo.strip()
        body = node.xpath('.//div[@class = "data"]/div[@class = "comment"]/text()').get()
        if body is not None:
            body = body.strip()
        created = node.xpath('.//div[@class = "meta"]/time/@rel').get()
        likes = node.xpath(
            './/div[@class = "data"]/div[@class = "reply"]//a[@class = "likebtn"]//text()'
        ).get()
        if likes is not None:
            digits = [s for s in likes.strip().split() if s.isdigit()]
            likes = digits[0] if digits else '0'
        return ava, user, geo, body, created, likes

    # comment tree: primary comments with nested replies
    comments_count = response.xpath('//*[@id="commentcount"]/text()').get()
    article.update({'comments-count': comments_count})
    comments = []
    for comment in response.xpath('//*[@id="commentcontainer"]/div'):
        p_ava, p_user, p_geo, p_content, p_time, p_likes = _comment_fields(
            comment.xpath('./div[@class = "primary-comment"]'))
        replies = []
        counter = 0
        for counter, reply in enumerate(
                comment.xpath('.//div[@class = "secondary-comment"]'), 1):
            s_ava, s_user, s_geo, s_content, s_time, s_likes = _comment_fields(reply)
            replies.append({
                'SenderAvatar': s_ava,
                'SenderFullName': s_user,
                'PublishedGeo': s_geo,
                'CommentContent': s_content,
                'CreatedDate': s_time,
                'Liked': s_likes,
                'Replies-count': 0,
                'Replies': []
            })
        comments.append({
            'SenderAvatar': p_ava,
            'SenderFullName': p_user,
            'PublishedGeo': p_geo,
            'CommentContent': p_content,
            'CreatedDate': p_time,
            'Liked': p_likes,
            'Replies-count': counter,
            'Replies': replies if counter != 0 else None
        })
    article.update({'comments': comments})
    # like counter: find a shareable URL, percent-encode the characters the
    # facebook plugin endpoint requires, then chase the like count
    url = response.xpath('//li[@class = "zalo-share-button"]/@data-href').get()
    if url is None:
        url = response.xpath('//li[@class="fb-share"]/a/@href').get()
    if url is None:
        # fixed: previously crashed with AttributeError on url.replace
        # when neither share element was present
        return
    url = url.replace("=", "%3D").replace("/", "%2F").replace(":", "%3A")
    like_request = "https://www.facebook.com/v3.1/plugins/like.php?action=like&app_id=288067561729014&channel=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df1b1dac16a53484%26domain%3Dthanhnien.vn%26origin%3Dhttps%253A%252F%252Fthanhnien.vn%252Ff20b42488425504%26relation%3Dparent.parent&container_width=0&href=" + \
        url + "&layout=button_count&locale=en_US&sdk=joey&share=true&show_faces=false&size=large"
    yield scrapy.Request(like_request,
                         callback=self.parse_likes,
                         meta={'article': article})
def parse_item(self, response):
    """Scrape a Cafef article page and yield a sharefb share-count request
    whose meta carries the scraped ``article`` dict."""
    article = dict()
    title_arr = response.xpath('//h1[@class="title"]/text()').get()
    # fixed: `title` used to stay unbound (NameError further down) when the
    # h1 was missing
    title = title_arr.strip() if title_arr is not None else None
    # JSON-LD block (best effort)
    try:
        ld_json = response.xpath(
            '//script[contains(text(),"NewsArticle")]/text()').get()
        ld_json = json.loads(ld_json)
        # project helper: normalises the JSON-LD date fields to timestamps
        ld_json = time.timestamp_converter(ld_json)
        article.update(ld_json)
    except Exception:  # was a bare except; keep best-effort semantics
        pass
    article.update({'headline': response.xpath("//meta[@itemprop='headline']/@content").get()})
    # thumbnail is stored as the str() of the src list (original behaviour)
    image_list = response.xpath('//div/img/@src').getall()
    article.update({'thumbnail': str(image_list)})
    # meta tags
    article.update({
        'type': response.xpath("//head/meta[@property='og:type']/@content").get(),
        'description': response.xpath("//meta[@name='description']/@content").get(),
        'keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'category': response.xpath("//meta[@property='article:section']/@content").get(),
        'copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'author': response.xpath("//meta[@name='author']/@content").get(),
        'language': response.xpath("//meta[@name='Language']/@content").get(),
        'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
        'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
        'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
        'organization': 'Cafef',
    })
    link = response.url
    article.update({'title': title, 'link': link})
    # overwrites the meta author with the visible byline (original behaviour)
    article.update({'author': response.xpath("//p[@class='author']/text()").get()})
    # body text
    content = ''
    for text in response.xpath(
            '(//div[@class="contentdetail"]/span/p/text())|(//div[@class="companyIntro"]/text())'
    ).getall():
        content += text.strip()
    article.update({'content_article': content})
    # content is always a str; the old `-1` fallback branch was dead code
    article.update({'word_count': len(content.split())})
    # related urls -- fixed: collected BEFORE the request is yielded so the
    # dict handed to the callback is complete
    relate_url = []
    htags = response.xpath('//div[@class="bg-tit-samecate"]/h4')
    for tag in htags:
        relate_urls = {}
        headline = tag.xpath('a/@title').get()
        url = "https://cafef.vn" + str(tag.xpath('a/@href').extract_first())
        relate_urls.update({'headline': headline, 'url': url})
        relate_url.append(relate_urls)
    article.update({"related_url": str(relate_url)})
    # fetch like/comment counts
    yield scrapy.Request("https://sharefb.cnnd.vn/?urls=" + response.url,
                         callback=self.parse_interactions,
                         headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                                  'Origin': 'https://cafef.vn',
                                  'Referer': response.url,
                                  'Sec-Fetch-Mode': 'cors',
                                  },
                         meta={'article': article})
def parse_article(self, response):
    """Scrape a Viblo post.

    Collects JSON-LD, meta tags, hashtags (from request meta), view/like/
    comment counters, body text and images, then returns a request for the
    post's comment API with the ``article`` dict in its meta.
    """
    article = {}
    try:
        ld_json = response.xpath(
            "//script[contains(text(),'Article')]/text()").get()
        if ld_json is None:
            # fall back to the stricter NewsArticle variant
            ld_json = response.xpath(
                "//script[contains(text(),'NewsArticle')]/text()").get()
        ld_json_dict = json.loads(ld_json)
        ld_json_dict = time.timestamp_converter(ld_json_dict)
        article.update(ld_json_dict)
    except Exception:  # was a bare except; keep best-effort semantics
        pass
    # meta tags
    elems = {
        'meta-description': response.xpath("//meta[@name='description']/@content").get(),
        'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'meta-title': response.xpath("//meta[@name='title']/@content").get(),
        'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'meta-author': response.xpath("//meta[@name='author']/@content").get(),
        'meta-content-language': response.xpath('//meta[@name = "content-language"]/@content').get(),
        'meta-geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
        # fixed copy-paste bug: this previously read the geo.region meta tag
        'meta-geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
        'meta-geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
        'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
        'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
        'url': response.url,
        'category': 'viblo article',
        'organization': 'viblo',
        'related-urls': response.xpath(
            '//div[@class = "related-posts-box"]//div[contains(@class, "post-card__title")]//a/@href'
        ).getall()
    }
    article.update(elems)
    # hashtags were extracted by the listing-page callback
    article.update({'hash-tags': response.meta['hash-tags']})
    # view counter lives in a tooltip such as "Views: 123"
    views = response.xpath(
        '//div[contains(@data-original-title, "Views:")]/@data-original-title'
    ).get()
    if views is not None:
        digits = [s for s in views.split() if s.isdigit()]
        views = digits[0] if digits else '0'
    article.update({'view-count': views})
    # upvote counter, e.g. "+12"
    likes = response.xpath(
        '//div[@class = "votes votes--side post-actions__vote mb-1"]/div/text()'
    ).get()
    if likes is not None:
        likes = likes.replace('+', '').replace('\n', '').strip()
    article.update({'likes-counter': likes})
    comment_count = response.xpath(
        '//div[@class = "post-meta__item mr-1"]//button[@class = "el-button el-button--text"]/span/text()'
    ).get()
    if comment_count is not None:
        article.update({'comments-count': comment_count.replace('\n', '').strip()})
    else:
        article.update({'comments-count': '0'})
    # body text
    content = ''
    for text in response.xpath(
            '//div[contains(@class, "md-contents article-content__body")]//text()'
    ).getall():
        content += text.strip()
    article.update({'content': content})
    article.update({'word_count': len(content.split())})
    # image urls
    images = {}
    for index, src in enumerate(
            response.xpath(
                '//div[contains(@class, "md-contents article-content__body")]//img/@src'
            ).getall(), 1):
        images.update({'image' + str(index): src})
    article.update({'image-urls': images})
    # the post id is the last '-'-separated token of the URL
    post_id = response.url.split('-')[-1]  # renamed: no longer shadows builtin id()
    comment_url = "https://viblo.asia/api/posts/" + post_id + "/comments"
    return scrapy.Request(comment_url,
                          callback=self.parse_comments,
                          meta={'article': article})
def parse_article(self, response):
    """Scrape a Nguoi Dua Tin article.

    Depending on whether an easyvideo embed is present, yields either a
    video-render request or a facebook like-count request, carrying the
    scraped ``article`` dict (plus id/url) in the request meta.
    """
    article = {}
    try:
        ld_json = response.xpath(
            '//html/head/script[contains(text(),"NewsArticle")]/text()'
        ).get()
        # strip control characters that would break json.loads
        ld_json = remove_ctrl(ld_json)
        ld_json_dict = json.loads(ld_json)
        ld_json_dict = time.timestamp_converter(ld_json_dict)
        article.update(ld_json_dict)
    except Exception:  # was a bare except; keep best-effort semantics
        pass
    # meta tags
    elems = {
        'meta-description': response.xpath("//meta[@name='description']/@content").get(),
        'meta-keywords': response.xpath("//meta[@name='keywords']/@content").get(),
        'meta-title': response.xpath("//meta[@name='title']/@content").get(),
        'meta-copyright': response.xpath("//meta[@name='copyright']/@content").get(),
        'meta-author': response.xpath("//meta[@name='author']/@content").get(),
        'language': response.xpath('//meta[@http-equiv = "content-language"]/@content').get(),
        'geo.placename': response.xpath('//meta[@name = "geo.placename"]/@content').get(),
        # fixed copy-paste bug: this previously read the geo.region meta tag
        'geo.position': response.xpath('//meta[@name = "geo.position"]/@content').get(),
        'geo.region': response.xpath('//meta[@name = "geo.region"]/@content').get(),
        'meta-article:author': response.xpath("//meta[@property='article:author']/@content").get(),
        'meta-article:publisher': response.xpath("//meta[@property='article:publisher']/@content").get(),
        'category': response.xpath(
            '//li[@class = "f-rsb m-auto nav-item position-relative d-inline-block active"]/a/text()'
        ).get(),
        'organization': 'người đưa tin',
        'url': response.url,
        'related_urls': response.xpath(
            '//section[@class = "article-content clearfix"]/following-sibling::section[@class = "row"]//li[@class = "box-news row pb-3 clearfix py-3 border-bottom "]/a/@href'
        ).getall()
    }
    article.update(elems)
    # body text
    content = ''
    for text in response.xpath(
            '/html/body//section[@class = "article-content clearfix"]/article//text()'
    ).getall():
        content += text.strip()
    for text in response.xpath('//div[@class = "box-center"]/p/text()').getall():
        content += text.strip()
    article.update({'content': content})
    article.update({'word_count': len(content.split())})
    # image urls: captioned figures first, then full-width/boxed images
    images = {}
    type1_index = 0
    for type1_index, src in enumerate(
            response.xpath(
                '/html/body//section[@class = "article-content clearfix"]//figure[@class = "tplCaption image"]/img/@src'
            ).getall(), 1):
        images.update({'image' + str(type1_index): src})
    for type2_index, src in enumerate(
            response.xpath(
                '//*[contains(@class,"image-full-width") or contains(@class,"box")]/img/@src'
            ).getall(), type1_index + 1):
        images.update({'image' + str(type2_index): src})
    article.update({'image-urls': images})
    url = response.url.replace('https://www.nguoiduatin.vn/', '')
    article_id = response.xpath('//@data-id').get()  # renamed: no longer shadows builtin id()
    if article_id is None:
        # fall back to the aNNNNNN token just before ".html" in the URL
        pv1 = response.url.find('.html')
        pv2 = response.url.find('a', pv1 - 7) + 1
        article_id = response.url[pv2:pv1]
    # easyvideo embed present -> resolve the video; otherwise fetch likes
    id_finder = response.xpath(
        '//script[contains(@src,"//embed.easyvideo.vn/play")]/@src').get()
    if id_finder is not None:
        easyvideo_id = id_finder.replace('//embed.easyvideo.vn/play', '')
        video_finder = "https://embed.easyvideo.vn/render/" + \
            easyvideo_id + "?targetId=MeCloudLoader_" + easyvideo_id
        yield scrapy.Request(video_finder,
                             callback=self.parse_video,
                             meta={
                                 'article': article,
                                 'url': url,
                                 'id': article_id
                             })
    else:
        like_request = "https://www.facebook.com/v2.9/plugins/like.php?action=like&app_id=1069396303196363&channel=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df122fdd10517174%26domain%3Dwww.nguoiduatin.vn%26origin%3Dhttps%253A%252F%252Fwww.nguoiduatin.vn%252Ff3f7ea1e941e5e4%26relation%3Dparent.parent&container_width=410&href=https%3A%2F%2Fwww.nguoiduatin.vn%2F" + url + "&layout=button_count&locale=vi_VN&sdk=joey&share=true&size=small"
        yield scrapy.Request(like_request,
                             callback=self.parse_likes,
                             meta={
                                 'article': article,
                                 'id': article_id
                             })
def parse_item(self, response):
    """Scrape a Saostar article page and yield a facebook like-count
    request; the scraped dict travels in the request meta under 'data'."""
    article = dict()
    image = dict()
    images = []
    title = response.xpath('//div[@class="head-article"]/h1/@data-title').get()
    if title is None:
        # not an article page (guard clause replaces the old `else: pass`)
        return
    # meta tags and microdata
    article.update({
        'headline': response.xpath('//meta[@itemprop="headline"]/@content').get(),
        'datePublished': response.xpath('//time[@itemprop="datePublished"]/@datetime').get(),
        'dateModified': response.xpath('//time[@itemprop="dateModified"]/@datetime').get(),
        'publisher': response.xpath('//div[@itemprop="publisher"]/span/text()').get(),
        'type': response.xpath("//head/meta[@property='og:type']/@content").get(),
        'description': response.xpath("//head/meta[@name='description']/@content").get(),
        'keywords': response.xpath("//head/meta[@name='keywords']/@content").get(),
        'category': response.xpath("//head/meta[@property='article:section']/@content").get(),
        'copyright': response.xpath("//head/meta[@name='copyright']/@content").get(),
        'Language': response.xpath("//head/meta[@name='Language']/@content").get(),
        'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
        'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
        'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
        'organization': 'Saostar',
    })
    # project helper: normalises the date fields to timestamps
    article = time.timestamp_converter(article)
    # og:image record (url/alt/width/height), only when an image exists
    url_img = response.xpath('//meta[@property="og:image"]/@content').get()
    if url_img is not None:
        image.update({
            'url': url_img,
            'alt': response.xpath('//meta[@property="og:image:alt"]/@content').get(),
            'width': response.xpath('//meta[@property="og:image:width"]/@content').get(),
            'height': response.xpath('//meta[@property="og:image:height"]/@content').get(),
        })
        images.append(image)
        article.update({'image': images})
    link = response.url
    article.update({'title': title, 'link': link})
    article.update({'author': response.xpath("//span[@class='writer']/text()").get()})
    # body text (paragraphs plus photo captions)
    content = ''
    for text in response.xpath('(//div[@id="content_detail"]/p/text())|'
                               '(//span['
                               '@class="wp-caption-text"]/text())').getall():
        content += text.strip()
    article.update({'content_article': content})
    # content is always a str; the old `-1` fallback branch was dead code
    article.update({'word_count': len(content.split())})
    # thumbnails -- fixed: `thumbnail is not []` compared identity against a
    # fresh list literal and was always True
    thumbnail = response.xpath('(//p/a/img/@src)|(//strong/a/img/@src)|(//div/a/img/@src)').getall()
    if thumbnail:
        article.update({'thumbnail': thumbnail})
    # related urls
    relate_url = []
    htags = response.xpath(
        '(//div[@class="content-block"]/div[@class="post mt15 js-post "]/h4[@class="post-title pl15 dis-inline-block"])|(//h3[@class="post-title mb10"])')
    for tag in htags:
        relate_urls = {}
        headline = tag.xpath('a/text()').get()
        # fixed: `headline is not []` was always True; skip entries with no text
        if headline is not None:
            url = str(tag.xpath('a/@href').extract_first())
            relate_urls.update({'headline': headline, 'url': url})
            relate_url.append(relate_urls)
    article.update({"related_url": relate_url})
    # like counter
    url = response.xpath('//meta[@itemprop="url"]/@content').get()
    like_request = "https://www.facebook.com/v2.8/plugins/like.php?action=like&channel=https%3A%2F%2Fstaticxx" \
                   ".facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df37cc7337bc398%26domain" \
                   "%3Dsaostar.vn%26origin%3Dhttps%253A%252F%252Fsaostar.vn%252Ff3ecd646e17999%26relation" \
                   "%3Dparent.parent&container_width=0&href=" + url \
                   + "&layout=button_count&locale=vi_VN&sdk=joey&share=true&show_faces=false"
    yield scrapy.Request(like_request, callback=self.parse_like, meta={'data': article})
def parse_item(self, response):
    """Scrape a VTV article page, then request the sharefb comment/like
    counters with the scraped ``article`` dict in the request meta."""
    article = dict()
    title = response.xpath(
        '(//h1[@class="title_detail"]/text())|(//div[@class="infomationdetail clearfix"]/h1/text())'
    ).get()
    if title is None:
        # not an article page (guard clause replaces the old big if-block)
        return
    # JSON-LD block (best effort)
    ld_json = response.xpath(
        '//head/script[@type="application/ld+json"]/text()').get()
    if ld_json is not None:
        try:
            ld_json = json.loads(ld_json)
            # project helper: normalises the JSON-LD date fields
            ld_json = time.timestamp_converter(ld_json)
            article.update(ld_json)
        except ValueError:
            pass
    # fixed: the two meta tags were swapped -- pubdate (publication date)
    # used to overwrite dateModified and lastmod used to overwrite
    # datePublished
    if 'dateModified' in article:
        date_modified = response.xpath('//meta[@name="lastmod"]/@content').get()
        article.update({'dateModified': time.Vnex_timestamp(date_modified)})
    if 'datePublished' in article:
        date_published = response.xpath('//meta[@name="pubdate"]/@content').get()
        article.update({'datePublished': time.Vnex_timestamp(date_published)})
    # meta tags
    article.update({
        'type': response.xpath("//head/meta[@property='og:type']/@content").get(),
        'description': response.xpath("//head/meta[@name='description']/@content").get(),
        'keywords': response.xpath("//head/meta[@name='keywords']/@content").get(),
        'category': response.xpath("//head/meta[@property='article:section']/@content").get(),
        'copyright': response.xpath("//head/meta[@name='copyright']/@content").get(),
        'language': response.xpath("//head/meta[@name='Language']/@content").get(),
        'geo_place_name': response.xpath("//meta[@name = 'geo.placename']/@content").get(),
        'geo_region': response.xpath("//meta[@name = 'geo.region']/@content").get(),
        'geo_position': response.xpath("//meta[@name = 'geo.position']/@content").get(),
        'organization': 'VTV',
    })
    # NOTE(review): overwrites the h1 title above with og:title -- preserved
    # from the original
    title = response.xpath('//meta[@property="og:title"]/@content').get()
    link = response.url
    article.update({'title': title, 'link': link})
    # byline
    author = ''
    for text in response.xpath(
            '(//p[@class="news-info"]/b/text())|(//p[@class="author"]/text())'
    ).getall():
        author += text.strip()
    article.update({'author': author})
    # body text
    content = ''
    for text in response.xpath(
            '(//div[@id="entry-body"]/p/text())|(//div[@class="w638 mgl96"]/div[@class="ta-justify"]/p/text())'
    ).getall():
        content += text.strip()
    article.update({'content_article': content})
    article.update({'word_count': len(content.split())})
    # thumbnails -- fixed: getall() never returns None, so the old
    # `is not None` test made the '-1' fallback unreachable
    thumbnail = response.xpath(
        '(//div[@class="infomationdetail clearfix"]/img/@src)|(//div[@class="noidung"]/img/@src)|(//div[@type="Photo"]/div/img/@src)|(//figure[@class="LayoutAlbumItem"]/a/img/@src)'
    ).getall()
    if thumbnail:
        article.update({'thumbnail': thumbnail})
    else:
        article.update({'thumbnail': '-1'})
    # related urls
    relate_url = []
    htags = response.xpath('//div[@class="clearfix pdb20"]/ul/li')
    for tag in htags:
        relate_urls = {}
        headline = tag.xpath('a/@title').get()
        # fixed: `headline is not []` was always True; skip entries with no title
        if headline is not None:
            url = "https://vtv.vn" + str(tag.xpath('a/@href').extract_first())
            relate_urls.update({'headline': headline, 'url': url})
            relate_url.append(relate_urls)
    article.update({"related_url": relate_url})
    # fetch like/comment counts keyed by the internal news id
    objectid = response.xpath(
        '//div[@class="aspNetHidden"]/input[@id="hdNewsId"]/@value').get()
    cmt_request = 'https://sharefb.cnnd.vn/?urls=http://vtv.vn/news-' + str(
        objectid) + '.htm'
    yield scrapy.Request(
        cmt_request,
        callback=self.parse_comment,
        headers={
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Origin': 'https://vtv.vn',
            'Sec-Fetch-Mode': 'cors',
            'Referer': response.url
        },
        meta={'article': article})