示例#1
0
    def parse_list(self, response):
        """Parse a result-list page into a ``Page`` of ``Art`` items.

        ``query`` and ``uri`` are read from ``response.meta``.

        Returns:
            * ``Page(total=0, arts=[])`` when ``empty()`` reports the body as
              empty;
            * a populated ``Page`` when every field can be extracted;
            * ``Result(ok=False, query=query)`` when extraction raises (for
              example an ``IndexError`` from a missing cell).
        """
        query = response.meta['query']
        uri = response.meta['uri']
        log.msg(u'got response of query %s' % query, level=log.INFO)

        if empty(response.body_as_unicode()):
            log.msg('empty result', level=log.INFO)
            return Page(query=query, uri=uri, total=0, arts=[])

        def gen(trs):
            # Start at row index 2, stop before the last row, take every
            # second row, then cap the number of items at MAX_ARTS.
            # NOTE(review): presumably data rows alternate with spacer rows
            # in this table -- confirm against the page markup.
            for tr in trs[2:-1:2][:MAX_ARTS]:
                yield Art(
                    title=tr.xpath('td[@class="c1"]/a/text()').extract()[0],
                    author=tr.xpath('td[@class="c2"]/a/text()').extract()[0],
                    company=tr.xpath('td[@class="c3"]/a/text()').extract()[0],
                    uri=urljoin(
                        BASE_URL,
                        tr.xpath('td[@class="c1"]/a/@href').extract()[0]),
                    # ``in`` is tested against the list of extracted text
                    # nodes, so only a node equal to the marker matches.
                    # NOTE(review): confirm a substring test is not intended.
                    status='reserve'
                    if u'予' in tr.xpath('td[@class="c7"]/text()').extract()
                    else 'other',
                )

        sel = Selector(response)
        try:
            trs = list(sel.xpath('//table[@class="FixFrame"]//tr'))
            # ``gen`` is consumed inside the ``try`` so that extraction errors
            # raised lazily by the generator are handled below.
            return Page(query=query,
                        uri=uri,
                        total=total(sel),
                        arts=list(gen(trs)))
        except Exception as e:
            # Catch ``Exception`` rather than using a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not reported as parse failures.
            # ``%r`` keeps the log call safe for non-ASCII exception text.
            log.msg('parse failed: %r' % (e,), level=log.ERROR)
            return Result(ok=False, query=query)
示例#2
0
    def parse_art(self, response):
        """Parse a single art detail page.

        ``uri`` is read from ``response.meta``.  When the request also carries
        ``page`` and ``ranks`` in its meta, the parsed art is stored into the
        shared page at ``ranks[uri]``.

        Returns:
            * ``Result(ok=False, query=uri)`` when extraction raises;
            * a one-item ``Page`` when ``response.meta`` has no ``page``;
            * the shared ``page`` once ``page_complete(page)`` is true;
            * ``None`` while the shared page is still incomplete.
        """
        uri = response.meta['uri']
        sel = Selector(response)
        try:
            art = Art(
                title=sel.xpath(
                    '//td[@class="td_title_bar_r1c2"]/text()').extract()[0],
                # NOTE(review): unlike title/company this keeps the whole
                # list of matches (no ``[0]``) -- presumably to allow several
                # authors; confirm against consumers of ``Art.author``.
                author=sel.xpath(
                    '//td[@class="DetailData_L"]/a[contains(@href, "author")]/text()'
                ).extract(),
                company=sel.xpath(
                    '//td[@class="CircleName"]/a[1]/text()').extract()[0],
                uri=uri,
                status=status_in_art(sel),
            )
        except Exception as e:
            # Catch ``Exception`` rather than using a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not reported as parse failures.
            # ``%r`` keeps the log call safe for non-ASCII exception text.
            log.msg('parse failed: %r' % (e,), level=log.ERROR)
            return Result(ok=False, query=uri)

        if 'page' not in response.meta:
            # Stand-alone request: wrap the art in its own single-item page.
            return Page(query=uri, uri=uri, total=1, arts=[art])

        # Part of a fan-out: fill this art's slot in the shared page and emit
        # the page only once ``page_complete`` accepts it.
        page = response.meta['page']
        ranks = response.meta['ranks']
        page['arts'][ranks[uri]] = art
        if page_complete(page):
            return page
        return None
示例#3
0
    def parse_user_illustrations_uri(self, response):
        """Build a ``Page`` of works from a user's illustration listing.

        ``query`` and ``uri`` come from ``response.meta``.  At most
        ``self.max_arts`` works are collected, and every ``Art`` shares the
        author text taken from the ``h1.user`` heading.

        Returns the ``Page`` on success, or a failed ``Result`` carrying the
        exception text when any extraction step raises.
        """
        meta = response.meta
        query = meta['query']
        uri = meta['uri']
        log.msg(u'got response of query %s' % uri)

        sel = Selector(response)
        try:
            # First run of digits found in the counter span, kept as extracted.
            total = sel.xpath(
                '//*[@id="wrapper"]/div[1]/div[1]/div/span/text()'
            ).re(r'\d+')[0]

            author = sel.xpath('//h1[@class="user"]/text()').extract()[0]
            arts = []
            for link in sel.xpath('//a[@class="work"]')[:self.max_arts]:
                title = link.xpath('h1/@title').extract()[0]
                art_uri = urljoin(BASE_URL, link.xpath('@href').extract()[0])
                thumbnail = link.xpath('img/@src').extract()[0]
                arts.append(Art(
                    title=title,
                    author=author,
                    uri=art_uri,
                    thumbnail_uri=thumbnail,
                ))

            return Page(query=query, uri=uri, total=total, arts=arts)
        except Exception as e:
            log.msg('parse failed', level=log.ERROR)
            return Result(ok=False, query=query, message=str(e))
示例#4
0
    def parse_complex_list(self, response):
        """Parse a list page and fan out one detail request per art.

        ``query`` and ``uri`` are read from ``response.meta``.

        Yields:
            * a single empty ``Page`` (``total=0``) when ``empty()`` reports
              the body as empty;
            * otherwise one request per extracted art link (capped at
              ``MAX_ARTS``), each carrying the shared ``page`` and the
              ``ranks`` mapping (art URI -> slot index) in its ``meta``;
            * ``Result(ok=False, query=query)`` if anything raises while the
              page or the requests are being built.  Requests yielded before
              the failure are not retracted.
        """
        query = response.meta['query']
        uri = response.meta['uri']
        log.msg(u'got response of query %s' % query, level=log.INFO)

        if empty(response.body_as_unicode()):
            log.msg('empty result', level=log.INFO)
            yield Page(query=query, uri=uri, total=0, arts=[])
            return

        sel = Selector(response)
        try:
            art_uris = [
                urljoin(BASE_URL, href) for href in sel.xpath(
                    '//tr[@class="TBLdtil"]/td[@class="noi_c2"]/a/@href').
                extract()[:MAX_ARTS]
            ]
            log.msg('got %d arts' % len(art_uris))
            # One placeholder slot per art; the detail requests carry this
            # same page object and the slot index of each URI.
            page = Page(query=query,
                        uri=uri,
                        total=total_complex(sel),
                        arts=[None] * len(art_uris))
            ranks = {art_uri: i for i, art_uri in enumerate(art_uris)}
            # A distinct loop name keeps the list page's ``uri`` intact.
            for art_uri in art_uris:
                req = self.make_art_request(art_uri)
                req.meta['page'] = page
                req.meta['ranks'] = ranks
                yield req
        except Exception as e:
            # Must not be a bare ``except``: closing the generator raises
            # GeneratorExit at ``yield req``; catching it and yielding again
            # fails with "generator ignored GeneratorExit".
            # ``%r`` keeps the log call safe for non-ASCII exception text.
            log.msg('parse failed: %r' % (e,), level=log.ERROR)
            yield Result(ok=False, query=query)
示例#5
0
 def parse_bangumi(self, response):
     """Decode a JSON response body into a ``Bangumi`` item.

     ``query`` is read from ``response.meta``.

     Returns ``Bangumi(query=..., content=<decoded JSON>)`` on success, or
     ``Result(ok=False, query=query)`` when reading or decoding the body
     raises.
     """
     query = response.meta['query']
     try:
         return Bangumi(query=query,
                        content=json.loads(response.body_as_unicode()))
     except Exception as e:
         # Catch ``Exception`` rather than using a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not reported as parse failures.
         # ``%r`` keeps the log call safe for non-ASCII exception text.
         log.msg('parse failed: %r' % (e,), level=log.ERROR)
         return Result(ok=False, query=query)
示例#6
0
 def parse_rss(self, response):
     """Turn every ``<item>`` node of the response into an art via ``make_art``.

     Returns ``RSS(query=..., arts=[...])`` on success, or
     ``Result(ok=False, query=query)`` after logging the error when
     selecting or converting the items raises.
     """
     query = response.meta['query']
     try:
         items = Selector(response).xpath('//item')
         arts = []
         for item in items:
             arts.append(make_art(item))
         return RSS(query=query, arts=arts)
     except Exception as e:
         reason = str(e)
         log.msg('parse failed: %s' % reason, level=log.ERROR)
         return Result(ok=False, query=query)
示例#7
0
 def parse_user(self, response):
     """Collect a user's posts from the ``main_list`` entries of the page.

     Each ``<li>`` under ``div.main_list/ul`` is converted with
     ``make_post``.  Returns ``User(query=..., posts=[...])`` on success, or
     ``Result(ok=False, query=query)`` after logging the error when
     selecting or converting the entries raises.
     """
     query = response.meta['query']
     try:
         entries = Selector(response).xpath(
             '//div[@class="main_list"]/ul/li')
         posts = []
         for entry in entries:
             posts.append(make_post(entry))
         return User(query=query, posts=posts)
     except Exception as e:
         reason = str(e)
         log.msg('parse failed: %s' % reason, level=log.ERROR)
         return Result(ok=False, query=query)
示例#8
0
 def parse_ranking_uri(self, response):
     """Store one ranking page's contents and emit the merged ``Page`` when
     no slot in ``pages`` is left as ``None``.

     ``response.meta`` supplies ``query``, the shared ``pages`` list and this
     response's ``page`` index.  A JSON body containing an ``error`` key
     contributes an empty list; otherwise its ``contents`` value is stored.

     Returns the merged ``Page`` once every slot is filled, ``None`` while
     some slot is still ``None``, or a failed ``Result`` carrying the
     exception text when anything raises.
     """
     query = response.meta['query']
     try:
         pages = response.meta['pages']
         payload = json.loads(response.body_as_unicode())
         if 'error' in payload:
             contents = []
         else:
             contents = payload['contents']
         pages[response.meta['page']] = contents

         if None in pages:
             # Other ranking pages are still outstanding.
             return

         arts = list(chain(*pages))
         ranking_uri = make_ranking_uri(query)
         return Page(query=query, uri=ranking_uri, total=len(arts), arts=arts)
     except Exception as e:
         body = response.body_as_unicode()
         log.msg('parse failed, content: %s' % body, level=log.ERROR)
         return Result(ok=False, query=query, message=str(e))
示例#9
0
def failed(query, message):
    """Log ``message`` as a parse failure at ERROR level and return the
    matching failed ``Result`` for ``query``."""
    text = 'parse failed: %s' % message
    log.msg(text, level=log.ERROR)
    result = Result(ok=False, query=query, message=message)
    return result