Example #1
    def parse(self):
        resp = rget(self.site_url)
        if not resp: return
        html = etree.HTML(resp.content)
        try:
            total_url = ''.join(
                html.xpath('//div[@id="pager"]/a[@class="last"]/@href'))
            pages = parse.parse_qs(parse.urlsplit(total_url).query)['page'][0]
        except (KeyError, IndexError):
            # Fall back to a known page count when the pager cannot be parsed.
            pages = 12
        urls = self._construct_page_url(int(pages) + 1)

        for page_url in urls:
            # Reset per page so each NewsPipeline().save() call persists only this page's items.
            details = []
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)

            hrefs = html.xpath('//div[@class="art_cat_box"]/table//a/@href')
            for href in hrefs:
                try:
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # A failure like this is most likely a network issue; the failed href should be pushed onto a retry queue to be re-crawled.
                    continue
            NewsPipeline().save(details)
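
The except branch above mentions pushing failed hrefs onto a retry queue for a later re-crawl. That queue is not shown anywhere in these examples; a minimal in-memory sketch of what it could look like (the RetryQueue name, its methods, and the attempt limit are all assumptions, not part of the original code):

from collections import deque

class RetryQueue:
    """Hypothetical queue of (href, referer) pairs to re-crawl later."""

    def __init__(self, max_attempts=3):
        self._queue = deque()
        self._attempts = {}
        self.max_attempts = max_attempts

    def push(self, href, referer):
        # Re-queue a href only while it has attempts left, so a permanently
        # broken link cannot circulate forever.
        attempts = self._attempts.get(href, 0)
        if attempts < self.max_attempts:
            self._attempts[href] = attempts + 1
            self._queue.append((href, referer))

    def pop(self):
        # Next (href, referer) pair, or None once the queue is drained.
        return self._queue.popleft() if self._queue else None

With such a helper, the except IndexError branch would call retry_queue.push(href, page_url) instead of silently continuing, and a second loop after the main one would drain the queue.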
Example #2
    def parse(self):
        resp = rget('http://meilibaobao.com/artlist-217.html')
        if not resp: return
        html = etree.HTML(resp.content)

        try:
            last_page = ''.join(
                html.xpath('//td[@class="pagernum"]/a[last()]/text()'))
            last_page = int(last_page)
        except ValueError:
            # Fall back to a known page count when the pager text is not numeric.
            last_page = 180

        pages = self._construct_pages(last_page + 1)
        for page_url in pages:
            # Reset per page so each save() call persists only this page's items.
            details = []
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)

            divs = html.xpath('//div[@id="columns"]/div')
            for dd in divs:
                try:
                    href = ''.join(dd.xpath('./div[@class="pic"]/a/@href'))
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # A failure like this is most likely a network issue; the failed href should be pushed onto a retry queue to be re-crawled.
                    continue
            NewsPipeline().save(details)
Example #3
    def parse(self):
        resp = rget('http://www.milanstand.com/article-zixun-1/')
        if not resp: return
        html = etree.HTML(resp.content)

        try:
            last_page = html.xpath(
                '//p[@class="nx"]/following-sibling::p/a/@href')[0]
            last_page = int(last_page.split('-')[-1][:-1])
        except (IndexError, ValueError):
            last_page = 55

        pages = self._construct_pages(last_page + 1)
        for page_url in pages:
            # Reset per page so each save() call persists only this page's items.
            details = []
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)

            divs = html.xpath(
                '//div[@class="box_3"]/table/tr//div[contains(text(), "包")]')
            for dd in divs:
                try:
                    href = ''.join(dd.xpath('./a/@href'))
                    href = urljoin(self.site_url, href)
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # A failure like this is most likely a network issue; the failed href should be pushed onto a retry queue to be re-crawled.
                    continue
            NewsPipeline().save(details)
Example #4
    def _extract(self, href, referer):
        resp = rget(href, referer=referer)
        if not resp: return
        html = etree.HTML(resp.content)
        if html is None: return

        info = html.xpath('//div[@class="info"]/text()')
        tag = info[-1]
        publish_time = info[0].split('\xa0')[0]
        author = ''.join(html.xpath('//div[@class="info"]/a/text()'))

        title = ''.join(html.xpath('//div[@class="article_con"]/h1/text()'))

        content = ''.join(html.xpath('//div[@class="art_con"]//text()'))
        if content:
            content = trim('。&&&'.join(content.split('。')))
        else:
            content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))
        if not content: return
        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '包包',  # "bags"
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Example #5
    def _extract(self, href, referer):
        resp = rget(href, referer=referer)
        if not resp: return
        html = etree.HTML(resp.content)
        if html is None: return

        title = ''.join(html.xpath('//*[@id="activity-name"]/text()'))
        if title:
            title = trim(title)
        else:
            return

        publish_time = re.findall(r'publish_time = "(\d{4}-\d{2}-\d{2})"?',
                                  resp.text)
        publish_time = publish_time[0] if publish_time else ''
        author = trim(''.join(html.xpath('//*[@id="js_name"]/text()')))

        content = ''.join(html.xpath('//*[@id="js_content"]//text()'))
        if content:
            content = trim('。&&&'.join(content.split('。')))
        else:
            return
        logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
            title, href, len(content)))

        return {
            'category': 'news',
            'site': self.url,
            'tag': -1,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
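
All of these examples rely on an rget helper that accepts an optional referer keyword and returns a falsy value on failure; its definition is not included here. A plausible sketch consistent with that usage (the timeout, the User-Agent string, and the exact failure handling are assumptions):

import requests

def rget(url, referer=None, timeout=10):
    # GET the url, optionally sending a Referer header; None signals failure.
    headers = {'User-Agent': 'Mozilla/5.0'}
    if referer:
        headers['Referer'] = referer
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        return resp
    except requests.RequestException:
        return None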
Example #6
    def parse(self):
        # Category ids: 9 = fashion, 4 = gossip, 11 = travel, 2 = wellness.
        categories = [9, 4, 11, 2]
        purl = 'http://weixin.sogou.com/pcindex/pc/pc_{category}/{page}.html'
        for category in categories:
            urls = [
                purl.format(page=page, category=category)
                for page in range(1, 5)
            ]
            urls.insert(
                0,
                'http://weixin.sogou.com/pcindex/pc/pc_{category}/pc_{category}.html'
                .format(category=category))
            for url in urls:
                resp = rget(url)
                if not resp: continue
                html = etree.HTML(resp.content)
                hrefs = html.xpath(
                    '//ul[@id="pc_0_0"]//li/div[@class="txt-box"]/h3/a/@href')
                if not hrefs:
                    hrefs = html.xpath('//li/div[@class="img-box"]/a/@href')
                if not hrefs:
                    # Unrecognized page layout: log and skip rather than dropping into the debugger.
                    logger.warning('no hrefs found on {}'.format(url))
                    continue
                logger.debug("\033[92m start crawling: {} \033[0m".format(url))
                details = []
                for href in hrefs:
                    try:
                        item = self._extract(href, url)
                        if not item: continue
                        details.append(item)
                    except IndexError:
                        # A failure like this is most likely a network issue; the failed href should be pushed onto a retry queue to be re-crawled.
                        continue
                NewsPipeline().save(details)
Example #7
    def parse(self):
        resp = rget(self.site_url)
        if not resp: return
        html = etree.HTML(resp.content)
        typeHrefs = html.xpath('//div[@class="sub_nav"]/div[@class="wrapper"]/ul/li//a/@href')

        for url in typeHrefs:
            pages = [urljoin(url, 'p{}.html'.format(page)) for page in range(1, 5)]
            pages[0] = url
            for page_url in pages:
                # Reset per page so each save() call persists only this page's items.
                details = []
                resp = rget(page_url)
                if not resp: continue
                html = etree.HTML(resp.content)

                hrefs = set(html.xpath('//dl[position()<last()]//a/@href|//a/@href'))
                for href in hrefs:
                    try:
                        item = self._extract(href, page_url)
                        if not item: continue
                        details.append(item)
                    except IndexError:
                        # A failure like this is most likely a network issue; the failed href should be pushed onto a retry queue to be re-crawled.
                        continue
                NewsPipeline().save(details)
Example #8
    def parse(self):
        pages = [urljoin(self.site_url, 'list_{}.html'.format(page)) for page in range(1, 494)]
        for page_url in pages:
            # Reset per page so each save() call persists only this page's items.
            details = []
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)

            hrefs = html.xpath('//div[@class="newlist"]//h6//a[2]/@href')
            for href in hrefs:
                try:
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # A failure like this is most likely a network issue; the failed href should be pushed onto a retry queue to be re-crawled.
                    continue
            NewsPipeline().save(details)
Example #9
    def _extract(self, href, referer):
        resp = rget(href, referer=referer)
        if not resp: return
        html = etree.HTML(resp.content)
        if html is None: return

        title = html.xpath('//td[@class="article_title1a"]/h1/text()')
        # title = title if title else html.xpath('//h1[@class="nw"]/text()')
        if title:
            title = title[0]
        else:
            return

        tag = html.xpath(
            '//div[@id="pagecenter"]/table/tr[2]/td[1]/table/tr/td/a[3]/text()'
        )
        tag = tag[0] if tag else '-1'
        publish_time = html.xpath('//td[@align="center"]//text()')
        publish_time = publish_time[0].split(
            '\xa0')[0] if publish_time else '-1'

        ps = html.xpath('//td[@class="article_title2a"]//text()')
        sText = ''.join(ps)
        if len(sText) <= 100:
            content = trim(sText)
        else:
            sText = sText.split('。')
            content = trim('。&&&'.join(sText))

        if filter_(content) or not content: return
        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '手表',  # "watches"
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': '',
            'publish_time': publish_time,
        }
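
Most of the extractors normalize the body with '。&&&'.join(content.split('。')), i.e. they append a &&& token after every Chinese full stop, presumably so a downstream consumer can split the text back into sentences. A quick self-contained illustration of the idiom:

text = '第一句。第二句。第三句。'
marked = '。&&&'.join(text.split('。'))
print(marked)  # 第一句。&&&第二句。&&&第三句。&&&

# A downstream stage can recover the sentences by splitting on the marker.
sentences = [s for s in marked.split('。&&&') if s]
print(sentences)  # ['第一句', '第二句', '第三句']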
Example #10
    def _extract(self, href, referer):
        resp = rget(href, referer=referer)
        if not resp: return
        html = etree.HTML(resp.content)
        if html is None: return

        title = html.xpath('//*[@class="title"]/h1/text()')
        if title:
            title = title[0]
        else:
            return

        tag = html.xpath('//*[@class="breadcrumb left"]/p/a[2]/text()')
        tag = tag[0] if tag else '-1'
        publish_time = html.xpath('//*[@class="article-attr"]/span[1]/text()')
        publish_time = publish_time[0] if publish_time else ''
        author = html.xpath('//*[@class="article-attr"]/span[4]/text()')
        author = author[0].split(':')[1] if author else ''

        ps = html.xpath('//*[@class="article"]//p/text()')
        sText = ''.join(ps)
        if len(sText) <= 100:
            content = trim(sText)
        else:
            sText = sText.split('。')
            content = trim('。&&&'.join(sText))

        if filter_(content) or not content: return
        logger.debug('\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'
                             .format(title, href, tag, len(content)))
        return {
            'category': '手表',  # "watches"
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Example #11
    def parse(self):
        url = 'http://m.sohu.com/ch/23/'
        resp = rget(url)
        if not resp: return
        html = etree.HTML(resp.content)
        hrefs = html.xpath('//div[@class="swiper-wrapper"]/div/a/@href') + \
            html.xpath('//ul[@class="feed-list-area"]//li/a/@href')
        if not hrefs:
            # Page probably failed to render; retry (beware: unbounded recursion if it stays empty).
            return self.parse()
        details = []
        for href in hrefs:
            # Skip absolute links; only relative article paths belong to this site.
            if href.startswith('http'): continue
            time.sleep(1)
            try:
                href = urljoin(self.url, href)
                logger.debug("\033[92m start crawling: {} \033[0m".format(href))
                item = self._extract(href, url)
                if not item: continue
                details.append(item)
            except IndexError:
                # A failure like this is most likely a network issue; the failed href should be pushed onto a retry queue and re-crawled.
                continue
        NewsPipeline().save(details)
Example #12
    def parse(self):
        pages_url = [self.site_url]
        pages_url += [
            urljoin(self.site_url, 'newsp{}.html'.format(page))
            for page in range(2, 20)
        ]
        for page_url in pages_url:
            details = []
            resp = rget(page_url)
            if not resp: continue
            html = etree.HTML(resp.content)
            hrefs = html.xpath('//div[@id="brand"]/table//tr/td/h3/a/@href')
            for href in hrefs:
                href = urljoin(self.site_url, href)
                try:
                    item = self._extract(href, page_url)
                    if not item: continue
                    details.append(item)
                except IndexError:
                    # A failure like this is most likely a network issue; the failed href should be pushed onto a retry queue and re-crawled.
                    continue
            NewsPipeline().save(details)
Example #13
    def _extract(self, href, referer):
        resp = rget(href, referer=referer)
        if not resp: return
        html = etree.HTML(resp.content)
        if html is None: return

        tag = ''.join(html.xpath('//div[@class="h"]/a[last()]/text()'))
        title = ''.join(html.xpath('//font[@class="f5"]/text()'))

        other = trim(''.join(html.xpath('//font[@class="f3"]/text()')))
        # "发布时间" = publish time, "来源" = source.
        other = re.findall(r'发布时间:(\d{4}-\d{2}-\d{2})来源:(\w+)', other)
        if other:
            publish_time = other[0][0]
            author = other[0][1]
        else:
            publish_time = author = ''

        content = ''.join(
            html.xpath(
                '//div[@class="mcontent"]//p[string-length(text()) >1]/text()')
        )
        if content:
            content = trim('。&&&'.join(content.split('。')))
        else:
            content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))
        if filter_(content) or not content: return
        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '包包',  # "bags"
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Example #14
    def _extract(self, href, referer):
        resp = rget(href, referer=referer)
        if not resp: return
        html = etree.HTML(resp.content)
        if html is None: return

        title = ''.join(html.xpath('//title/text()'))
        if title:
            title = trim(title)
        else:
            return

        publish_time = re.findall(
            r"time: '(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'?", resp.text)
        publish_time = publish_time[0] if publish_time else ''
        author = trim(''.join(re.findall(r"name: '(\w+)'?", resp.text)))
        tag = ','.join(re.findall(r'{"name":"(\w+)"}\]?', resp.text))

        content = ''.join(re.findall(r"content: '(.+)'?", resp.text))
        if content:
            content = trim(content)
            # Strip leftover entity/markup debris from the inlined content.
            content = re.sub(r'[&lt&gt&quot;pa-z\/#3D\.-:_]', '', content)
            content = '。&&&'.join(content.split('。'))
        else:
            return
        logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
            title, href, len(content)))

        return {
            'category': 'news',
            'site': self.url,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
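
Examples #5 and #14 read publish_time, author, and content with regular expressions over the raw resp.text instead of XPath, because those values live in inline JavaScript variables rather than HTML elements. A small illustration of the same pattern against a made-up snippet:

import re

raw = """<script>
    var msg = { time: '2018-06-01 09:30:00', name: 'demo_author' };
</script>"""

publish_time = re.findall(r"time: '(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'?", raw)
print(publish_time[0] if publish_time else '')      # 2018-06-01 09:30:00
print(''.join(re.findall(r"name: '(\w+)'?", raw)))  # demo_author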
Example #15
    def _extract(self, href, referer):
        resp = rget(href, referer=referer)
        if not resp: return
        html = etree.HTML(resp.content)
        if html is None: return

        title = ''.join(html.xpath('//font[@class="f5 f6"]/text()'))
        tag = ''.join(html.xpath('//div[@id="ur_here"]/a[2]/text()'))

        other = ''.join(html.xpath('//font[@class="f3"]/text()')).split('/')
        publish_time = other[1].strip()
        author = other[0].strip() if other[0] else '-1'

        content = ''.join(
            html.xpath(
                '//div[@class="box_1"]/div//span[string-length(text()) >1]/text()'
            ))
        if not content:
            content = ''.join(
                html.xpath(
                    '//div[@class="box_1"]/div//p[string-length(text()) >1]/text()'
                ))
        content = trim('。&&&'.join(content.split('。')))
        if filter_(content) or not content: return
        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{}; \033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '包包',  # "bags"
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Example #16
    def _extract(self, href, referer):
        resp = rget(href, referer=referer)
        if not resp: return
        html = etree.HTML(resp.content)
        if html is None: return

        title = ''.join(html.xpath('//h2[@class="title-info"]/text()'))
        if title:
            title = trim(title)
        else:
            return

        publish_time = trim(''.join(
            html.xpath('//footer[@class="time"]/text()')))
        author = trim(''.join(html.xpath('//header[@class="name"]/text()')))

        content = html.xpath('//div[@class="display-content"]//p/text()') + \
            html.xpath('//div[@class="hidden-content hide"]//p/text()')
        content = ''.join(content)
        if content:
            content = trim('。&&&'.join(content.split('。')))
        else:
            return
        logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
            title, href, len(content)))

        return {
            'category': 'news',
            'site': self.url,
            'tag': '-1',
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
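
Every parser hands its results to NewsPipeline().save(details), whose implementation is likewise not shown. A minimal sketch under the assumption that items are dicts keyed by news_url and that storage is MongoDB (the connection details, database, and collection names are hypothetical):

from pymongo import MongoClient

class NewsPipeline:
    """Hypothetical sink that persists extracted items, deduplicated by news_url."""

    def __init__(self, uri='mongodb://localhost:27017', db='crawler'):
        self._col = MongoClient(uri)[db]['news']

    def save(self, details):
        for item in details:
            # Upsert so a re-crawled page overwrites its earlier copy
            # instead of inserting a duplicate.
            self._col.update_one(
                {'news_url': item['news_url']},
                {'$set': item},
                upsert=True,
            )

An upsert keyed on news_url also makes the per-page save() calls in the parse() methods idempotent, so re-running a crawl does not multiply rows.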