def parse(self):
    resp = rget(self.site_url)
    html = etree.HTML(resp.content)
    try:
        total_url = ''.join(
            html.xpath('//div[@id="pager"]/a[@class="last"]/@href'))
        pages = parse.parse_qs(parse.urlsplit(total_url).query)['page'][0]
    except (KeyError, IndexError):
        # Pager not found or no 'page' query param; fall back to a default.
        pages = 12
    urls = self._construct_page_url(int(pages) + 1)
    details = []
    for page_url in urls:
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        hrefs = html.xpath('//div[@class="art_cat_box"]/table//a/@href')
        for href in hrefs:
            try:
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; the failed href should be
                # pushed onto a queue for re-crawling (see the sketch below).
                continue
    NewsPipeline().save(details)
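# The except-IndexError comments throughout these parsers all note that a
# failed href should be queued for a later re-crawl. A minimal sketch of
# such a queue, assuming an in-process deque is acceptable; RETRY_QUEUE,
# queue_for_retry and drain_retry_queue are hypothetical names, not part
# of this codebase:
from collections import deque

RETRY_QUEUE = deque()

def queue_for_retry(href, referer):
    # Record the failed detail page so a later pass can re-fetch it.
    RETRY_QUEUE.append((href, referer))

def drain_retry_queue(extract, save):
    # Re-run extraction for every queued href. A retry counter would be
    # needed to re-queue repeated failures; omitted here for brevity.
    details = []
    while RETRY_QUEUE:
        href, referer = RETRY_QUEUE.popleft()
        item = extract(href, referer)
        if item:
            details.append(item)
    if details:
        save(details)

# Usage: inside the except-IndexError blocks, call
# queue_for_retry(href, page_url), then drain the queue after the main pass.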
def parse(self):
    resp = rget('http://meilibaobao.com/artlist-217.html')
    html = etree.HTML(resp.content)
    try:
        last_page = ''.join(
            html.xpath('//td[@class="pagernum"]/a[last()]/text()'))
        last_page = int(last_page)
    except ValueError:
        # Pager text missing or non-numeric; fall back to a default.
        last_page = 180
    pages = self._construct_pages(last_page + 1)
    details = []
    for page_url in pages:
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        divs = html.xpath('//div[@id="columns"]/div')
        for dd in divs:
            try:
                href = ''.join(dd.xpath('./div[@class="pic"]/a/@href'))
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; push the failed href onto
                # a queue for re-crawling.
                continue
    NewsPipeline().save(details)
def parse(self):
    resp = rget('http://www.milanstand.com/article-zixun-1/')
    html = etree.HTML(resp.content)
    try:
        last_page = html.xpath(
            '//p[@class="nx"]/following-sibling::p/a/@href')[0]
        last_page = int(last_page.split('-')[-1][:-1])
    except IndexError:
        last_page = 55
    pages = self._construct_pages(last_page + 1)
    details = []
    for page_url in pages:
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        divs = html.xpath(
            '//div[@class="box_3"]/table/tr//div[contains(text(), "包")]')
        for dd in divs:
            try:
                href = ''.join(dd.xpath('./a/@href'))
                href = urljoin(self.site_url, href)
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; push the failed href onto
                # a queue for re-crawling.
                continue
    NewsPipeline().save(details)
def _extract(self, href, referer):
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if html is None:
        return
    info = html.xpath('//div[@class="info"]/text()')
    tag = info[-1]
    publish_time = info[0].split('\xa0')[0]
    author = ''.join(html.xpath('//div[@class="info"]/a/text()'))
    title = ''.join(html.xpath('//div[@class="article_con"]/h1/text()'))
    content = ''.join(html.xpath('//div[@class="art_con"]//text()'))
    if content:
        content = trim('。&&&'.join(content.split('。')))
    else:
        content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))
    if not content:
        return
    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '包包',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    resp = rget(href, referer=referer)
    if not resp:
        # Give up on a failed fetch. (The original recursed into itself here
        # without returning, then crashed on resp being None; see the
        # bounded-retry sketch below for a retrying alternative.)
        return
    html = etree.HTML(resp.content)
    if html is None:
        return
    title = ''.join(html.xpath('//*[@id="activity-name"]/text()'))
    if title:
        title = trim(title)
    else:
        return
    publish_time = re.findall(r'publish_time = "(\d{4}-\d{2}-\d{2})"?',
                              resp.text)
    publish_time = publish_time[0] if publish_time else ''
    author = trim(''.join(html.xpath('//*[@id="js_name"]/text()')))
    content = ''.join(html.xpath('//*[@id="js_content"]//text()'))
    if content:
        content = trim('。&&&'.join(content.split('。')))
    else:
        return
    logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
        title, href, len(content)))
    return {
        'category': 'news',
        'site': self.url,
        'tag': -1,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
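# Two of the _extract methods in this file originally retried a failed rget
# by calling themselves recursively, which risks unbounded recursion. A
# minimal bounded-retry sketch, assuming rget returns a falsy value on
# failure; rget_with_retry is a hypothetical helper, not part of this
# codebase:
import time

def rget_with_retry(url, referer=None, retries=3, delay=1):
    # Try the request up to `retries` times, sleeping between attempts.
    for _ in range(retries):
        resp = rget(url, referer=referer)
        if resp:
            return resp
        time.sleep(delay)
    return None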
def parse(self):
    # Category ids: 9 fashion, 4 gossip, 11 travel, 2 wellness
    categories = [9, 4, 11, 2]
    purl = 'http://weixin.sogou.com/pcindex/pc/pc_{category}/{page}.html'
    for category in categories:
        urls = [
            purl.format(page=page, category=category) for page in range(1, 5)
        ]
        urls.insert(
            0,
            'http://weixin.sogou.com/pcindex/pc/pc_{category}/pc_{category}.html'
            .format(category=category))
        for url in urls:
            resp = rget(url)
            if not resp:
                continue
            html = etree.HTML(resp.content)
            hrefs = html.xpath(
                '//ul[@id="pc_0_0"]//li/div[@class="txt-box"]/h3/a/@href')
            if not hrefs:
                hrefs = html.xpath('//li/div[@class="img-box"]/a/@href')
            if not hrefs:
                # Neither selector matched; log and move on. (The original
                # dropped into pdb.set_trace() here, debugging residue that
                # must not ship.)
                logger.warning('no hrefs found for {}'.format(url))
                continue
            logger.debug("\033[92m Start crawling: {} \033[0m".format(url))
            details = []
            for href in hrefs:
                try:
                    item = self._extract(href, url)
                    if not item:
                        continue
                    details.append(item)
                except IndexError:
                    # Most likely a network failure; push the failed href
                    # onto a queue for re-crawling.
                    continue
            NewsPipeline().save(details)
def parse(self):
    resp = rget(self.site_url)
    html = etree.HTML(resp.content)
    type_hrefs = html.xpath(
        '//div[@class="sub_nav"]/div[@class="wrapper"]/ul/li//a/@href')
    for url in type_hrefs:
        pages = [
            urljoin(url, 'p{}.html'.format(page)) for page in range(1, 5)
        ]
        pages[0] = url
        details = []
        for page_url in pages:
            resp = rget(page_url)
            if not resp:
                continue
            html = etree.HTML(resp.content)
            hrefs = set(
                html.xpath('//dl[position()<last()]//a/@href|//a/@href'))
            for href in hrefs:
                try:
                    item = self._extract(href, page_url)
                    if not item:
                        continue
                    details.append(item)
                except IndexError:
                    # Most likely a network failure; push the failed href
                    # onto a queue for re-crawling.
                    continue
        NewsPipeline().save(details)
def parse(self):
    pages = [
        urljoin(self.site_url, 'list_{}.html'.format(page))
        for page in range(1, 494)
    ]
    details = []
    for page_url in pages:
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        hrefs = html.xpath('//div[@class="newlist"]//h6//a[2]/@href')
        for href in hrefs:
            try:
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; push the failed href onto
                # a queue for re-crawling.
                continue
    NewsPipeline().save(details)
def _extract(self, href, referer):
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if html is None:
        return
    title = html.xpath('//td[@class="article_title1a"]/h1/text()')
    # title = title if title else html.xpath('//h1[@class="nw"]/text()')
    if title:
        title = title[0]
    else:
        return
    tag = html.xpath(
        '//div[@id="pagecenter"]/table/tr[2]/td[1]/table/tr/td/a[3]/text()')
    tag = tag[0] if tag else '-1'
    publish_time = html.xpath('//td[@align="center"]//text()')
    publish_time = publish_time[0].split('\xa0')[0] if publish_time else '-1'
    ps = html.xpath('//td[@class="article_title2a"]//text()')
    s_text = ''.join(ps)
    if len(s_text) <= 100:
        content = trim(s_text)
    else:
        content = trim('。&&&'.join(s_text.split('。')))
    if filter_(content) or not content:
        return
    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '手表',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': '',
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if html is None:
        return
    title = html.xpath('//*[@class="title"]/h1/text()')
    if title:
        title = title[0]
    else:
        return
    tag = html.xpath('//*[@class="breadcrumb left"]/p/a[2]/text()')
    tag = tag[0] if tag else '-1'
    publish_time = html.xpath('//*[@class="article-attr"]/span[1]/text()')
    publish_time = publish_time[0] if publish_time else ''
    author = html.xpath('//*[@class="article-attr"]/span[4]/text()')
    author = author[0].split(':')[1] if author else ''
    ps = html.xpath('//*[@class="article"]//p/text()')
    s_text = ''.join(ps)
    if len(s_text) <= 100:
        content = trim(s_text)
    else:
        content = trim('。&&&'.join(s_text.split('。')))
    if filter_(content) or not content:
        return
    logger.debug('\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'
                 .format(title, href, tag, len(content)))
    return {
        'category': '手表',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def parse(self):
    url = 'http://m.sohu.com/ch/23/'
    resp = rget(url)
    html = etree.HTML(resp.content)
    hrefs = html.xpath('//div[@class="swiper-wrapper"]/div/a/@href') + \
        html.xpath('//ul[@class="feed-list-area"]//li/a/@href')
    if not hrefs:
        # Retry the whole page. (The original recursed here without
        # returning, then fell through and saved an empty batch.)
        return self.parse()
    details = []
    for href in hrefs:
        if href.startswith('http'):
            continue
        time.sleep(1)
        try:
            href = urljoin(self.url, href)
            logger.debug("\033[92m Start crawling: {} \033[0m".format(href))
            item = self._extract(href, url)
            if not item:
                continue
            details.append(item)
        except IndexError:
            # Most likely a network failure; push the failed href onto a
            # queue for re-crawling.
            continue
    NewsPipeline().save(details)
def parse(self):
    pages_url = [self.site_url]
    pages_url += [
        urljoin(self.site_url, 'newsp{}.html'.format(page))
        for page in range(2, 20)
    ]
    for page_url in pages_url:
        details = []
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        hrefs = html.xpath('//div[@id="brand"]/table//tr/td/h3/a/@href')
        for href in hrefs:
            href = urljoin(self.site_url, href)
            try:
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; push the failed href onto
                # a queue for re-crawling.
                continue
        NewsPipeline().save(details)
def _extract(self, href, referer):
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if html is None:
        return
    tag = ''.join(html.xpath('//div[@class="h"]/a[last()]/text()'))
    title = ''.join(html.xpath('//font[@class="f5"]/text()'))
    other = trim(''.join(html.xpath('//font[@class="f3"]/text()')))
    other = re.findall(r'发布时间:(\d{4}-\d{2}-\d{2})来源:(\w+)', other)
    if other:
        publish_time = other[0][0]
        author = other[0][1]
    else:
        publish_time = author = ''
    content = ''.join(
        html.xpath(
            '//div[@class="mcontent"]//p[string-length(text()) >1]/text()'))
    if content:
        content = trim('。&&&'.join(content.split('。')))
    else:
        content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))
    if filter_(content) or not content:
        return
    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '包包',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if html is None:
        return
    title = ''.join(html.xpath('//title/text()'))
    if title:
        title = trim(title)
    else:
        return
    publish_time = re.findall(
        r"time: '(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'?", resp.text)
    publish_time = publish_time[0] if publish_time else ''
    author = trim(''.join(re.findall(r"name: '(\w+)'?", resp.text)))
    tag = ','.join(re.findall(r'{"name":"(\w+)"}\]?', resp.text))
    content = ''.join(re.findall(r"content: '(.+)'?", resp.text))
    if content:
        content = trim(content)
        # Appears to strip leftover markup fragments from the inline JS
        # payload; kept verbatim from the original pattern.
        content = re.sub(r'[<>"pa-z\/#3D\.-:_]', '', content)
        content = '。&&&'.join(content.split('。'))
    else:
        return
    logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
        title, href, len(content)))
    return {
        'category': 'news',
        'site': self.url,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    resp = rget(href, referer=referer)
    if not resp:
        return
    html = etree.HTML(resp.content)
    if html is None:
        return
    title = ''.join(html.xpath('//font[@class="f5 f6"]/text()'))
    tag = ''.join(html.xpath('//div[@id="ur_here"]/a[2]/text()'))
    other = ''.join(html.xpath('//font[@class="f3"]/text()')).split('/')
    publish_time = other[1].strip()
    author = other[0].strip() if other[0] else '-1'
    content = ''.join(
        html.xpath(
            '//div[@class="box_1"]/div//span[string-length(text()) >1]/text()'
        ))
    if not content:
        content = ''.join(
            html.xpath(
                '//div[@class="box_1"]/div//p[string-length(text()) >1]/text()'
            ))
    content = trim('。&&&'.join(content.split('。')))
    if filter_(content) or not content:
        return
    logger.debug(
        '\033[96m title:{}; href:{}; tag:{}; content:{}; \033[0m'.format(
            title, href, tag, len(content)))
    return {
        'category': '包包',
        'site': self.site,
        'tag': tag,
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }
def _extract(self, href, referer):
    resp = rget(href, referer=referer)
    if not resp:
        # Give up on a failed fetch. (The original recursed into itself
        # here without returning, then crashed on resp being None.)
        return
    html = etree.HTML(resp.content)
    if html is None:
        return
    title = ''.join(html.xpath('//h2[@class="title-info"]/text()'))
    if title:
        title = trim(title)
    else:
        return
    publish_time = trim(''.join(
        html.xpath('//footer[@class="time"]/text()')))
    author = trim(''.join(html.xpath('//header[@class="name"]/text()')))
    content = html.xpath('//div[@class="display-content"]//p/text()') + \
        html.xpath('//div[@class="hidden-content hide"]//p/text()')
    content = ''.join(content)
    if content:
        content = trim('。&&&'.join(content.split('。')))
    else:
        return
    logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
        title, href, len(content)))
    return {
        'category': 'news',
        'site': self.url,
        'tag': '-1',
        'news_url': href,
        'title': title,
        'content': content,
        'author': author,
        'publish_time': publish_time,
    }