import logging
import traceback

import scrapy

# Each `def` below is a method excerpted from a Scrapy spider class. The
# snippets assume `HuxiuItem` and `filter_tags` are defined in the project;
# hedged sketches of both are given after the last snippet.


def parse_content(self, response):
    tmp = None
    try:
        # Breadcrumb links above the thread: board, sub-board, thread title.
        tmp = response.xpath(
            '//div[@class="bm cl"]/div[@class="z"]/a[@href]//text()'
        ).extract()
        if tmp:
            first = tmp[2].strip()
            second = tmp[3].strip()
            title = tmp[4].strip()
            # First quoted block of the post body, stripped of markup,
            # newlines and non-breaking spaces.
            tmp = filter_tags(
                response.xpath(
                    '//div[@class="pl bm"]//table[@class="plhin"]'
                    '//td[@class="t_f"]//blockquote'
                ).extract_first()
            ).strip().replace('\n', '').replace('\xa0', ' ')
            if tmp:
                item = HuxiuItem()
                item['first'] = first
                item['second'] = second
                item['title'] = title
                item['content'] = tmp.split('\r')
                item['url'] = str(response.request.url).strip()
                return item
    except Exception:
        # `tmp` holds whichever extraction succeeded last (or None).
        print(tmp)
        print('Exception occurred', traceback.format_exc())
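# `filter_tags` above is not defined in this section; a minimal sketch,
# assuming it only needs to strip tags and decode entities from an HTML
# fragment. It leans on w3lib (already a Scrapy dependency); the real helper
# may do more.
from w3lib.html import remove_tags, replace_entities

def filter_tags(html):
    """Strip HTML tags and decode character entities from a fragment."""
    if html is None:
        return ''
    return replace_entities(remove_tags(html))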
def parse_article(self, response):
    detail = response.xpath('//div[@class="article__bottom-content__right fl"]')
    item = HuxiuItem()
    item['title'] = detail.xpath('div/h1/text()')[0].extract()
    item['link'] = response.url
    item['posttime'] = detail.xpath(
        'div/span[@class="article__time"]/text()')[0].extract()
    print(item['title'], item['link'], item['posttime'])
    yield item
def parse(self, response): for sel in response.xpath('//div[@class="mod-info-flow"]/div/div[@class="mob-ctt"]'): item = HuxiuItem() item['title'] = sel.xpath('h3/a/text()')[0].extract() item['link'] = sel.xpath('h3/a/@href')[0].extract() url = response.urljoin(item['link']) item['desc'] = sel.xpath('div[@class="mob-sub"]/text()')[0].extract() # print(item['title'],item['link'],item['desc']) yield scrapy.Request(url, callback=self.parse_article)
def parse_article(self, response):
    detail = response.xpath('//div[@class="article-wrap"]')
    item = HuxiuItem()
    item['title'] = detail.xpath('h1/text()')[0].extract()
    item['link'] = response.url
    item['posttime'] = detail.xpath(
        'div[@class="article-author"]/span[@class="article-time"]/text()'
    )[0].extract()
    print(item['title'], item['link'], item['posttime'])
    yield item
def parse_article(self, response):
    detail = response.xpath('//div[@class="article-wrapper"]')
    item = HuxiuItem()
    item['title'] = detail.xpath(
        'div/section/div/div[1]/div[1]/h1/text()')[0].extract()
    item['link'] = response.url
    item['posttime'] = detail.xpath(
        'div/section/div/div[1]/div[1]/span/text()')[0].extract()
    print(item['title'], item['link'], item['posttime'])
    yield item
def parse_item(self, response):
    self.logger.info('Hi, this is an item page! %s', response.url)
    detail = response.xpath('//div[@class="article-wrap"]')
    item = HuxiuItem()
    item['title'] = detail.xpath('h1/text()')[0].extract()
    item['link'] = response.url
    item['published'] = detail.xpath(
        'div[@class="article-author"]/span[@class="article-time"]/text()'
    )[0].extract()
    # logging.info() treats its first argument as a %-format string; the
    # original passed all three values positionally, which fails to format.
    logging.info('%s %s %s', item['title'], item['link'], item['published'])
    yield item
def parse(self, response): for sel in response.xpath('//div[@class="mob-ctt index-article-list-yh"]'): item = HuxiuItem() item['title'] = sel.xpath('h2/a/text()').extract_first() item['link'] = sel.xpath('h2/a/@href')[0].extract() # url = response.urljoin("https://www.huxiu.com"+item['link']) url = response.urljoin(item['link']) item['desc'] = sel.xpath('div[@class="mob-sub"]/text()')[0].extract() # print(item['title'], item['link'], url,item['desc']) yield scrapy.Request(url, callback=self.parse_article)
def parse_item(self, response):
    detail = response.xpath('//div[@class="article-wrap"]')
    item = HuxiuItem()
    item['link'] = response.url
    item['image_url'] = detail.xpath(
        'div[@class="article-img-box"]/img/@src').extract_first()
    item['title'] = detail.xpath('h1/text()').extract_first()
    item['posttime'] = detail.xpath(
        'div[@class="article-author"]/div[@class="column-link-box"]'
        '/span[@class="article-time pull-left"]/text()').extract_first()
    yield item
def parse_article(self, response):
    detail = response.xpath('//div[@class="article-wrap"]')
    item = HuxiuItem()
    item['title'] = detail.xpath('h1/text()')[0].extract().strip()
    item['link'] = response.url.strip()
    item['author'] = detail.xpath(
        'div[@class="article-author"]/span[@class="author-name"]/a/text()'
    )[0].extract().strip()
    item['published'] = detail.xpath(
        'div[@class="article-author"]/div[@class="column-link-box"]/span/text()'
    )[0].extract().strip()
    yield item
def parse_article(self, response):
    detail = response.xpath('//div[@class="article-wrap"]')
    item = HuxiuItem()
    item['title'] = detail.xpath('h1/text()')[0].extract()
    item['url'] = response.url
    # TODO: `body` and `source_site` are placeholders in the original code;
    # both are set to the page URL rather than extracted from the page.
    item['body'] = response.url
    item['source_site'] = response.url
    item['published'] = detail.xpath(
        'div[@class="article-author"]/span[@class="article-time"]/text()'
    )[0].extract()
    yield item
def parse(self, response): for sel in response.xpath( '//div[@class="mod-info-flow"]/div/div[@class="mob-ctt"]'): item = HuxiuItem() item['title'] = sel.xpath('h2/a/text()')[0].extract() item['link'] = sel.xpath('h2/a/@href')[0].extract() sub = sel.xpath('div[contains(@class, "mob-sub")]') if len(sub.xpath('span')) > 0: item['desc'] = sub.xpath('span/text()')[0].extract() else: item['desc'] = sub.xpath('text()')[0].extract() url = response.urljoin(item['link']) yield scrapy.Request(url, callback=self.parse_article)
def parse_content(self, response):
    try:
        # Breadcrumb links: board, sub-board, thread title.
        tmp = response.xpath(
            '//div[@class="bm cl"]/div/a[@href]//text()').extract()
        if tmp:
            first = tmp[1].strip()
            second = tmp[2].strip()
            title = tmp[3].strip()
            # Join all text nodes of the first post body.
            tmp = ','.join(
                response.xpath('//div[@class="pl bm"]/div')[0].xpath(
                    '//td[@class="t_f"]/text()').extract())
            if tmp:
                item = HuxiuItem()
                item['first'] = first
                item['second'] = second
                item['title'] = title
                item['content'] = tmp.strip().replace('\n', '').replace('\r', '')
                item['url'] = str(response.request.url).strip()
                yield item
    except Exception:
        # Silently skip pages that do not match the expected layout.
        pass
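# `HuxiuItem` is used throughout but defined elsewhere in the project. A
# minimal sketch declaring every field the snippets above assign; the field
# set is inferred from usage here, not taken from the original items.py.
class HuxiuItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    url = scrapy.Field()
    desc = scrapy.Field()
    content = scrapy.Field()
    body = scrapy.Field()
    author = scrapy.Field()
    posttime = scrapy.Field()
    published = scrapy.Field()
    image_url = scrapy.Field()
    source_site = scrapy.Field()
    first = scrapy.Field()
    second = scrapy.Field()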