Exemplo n.º 1
0
 def parse_username(self, response):
     """Resolve a username search response to the next request.

     Reads the query from ``response.meta['query']``.  When the page has
     no ``_no-item`` element and lists exactly one
     ``user-recommendation-item``, the user id is taken from that item's
     title link and a request for the user's illustrations is returned.
     In every other non-failing case (``_no-item`` present, or several
     items) a request to ``SEARCH_USER_URL`` is returned whose callback
     is ``parse_username_recommendations``.

     Returns the result of ``failed(...)`` for a 404 status, for an
     empty item list on a page without ``_no-item``, or when anything
     inside the ``try`` raises.
     """
     query = response.meta['query']
     if response.status == 404:
         return failed(query, '404')
     sel = Selector(response)
     try:
         if not sel.xpath('//div[@class="_no-item"]'):
             items = list(sel.xpath('//li[@class="user-recommendation-item"]'))
             if not items:
                 return failed(query, 'inconsist username search result', response=response)
             if len(items) == 1:
                 # The user id is whatever follows the last '=' in the href.
                 href = items[0].xpath('.//a[@class="title"]/@href').extract()[0]
                 user_id = href.split('=')[-1]
                 check_user_id(user_id)
                 return self._make_user_illustrations_uri_request(
                     USER_ILLUSTRATIONS_URL_TEMPLATE % user_id,
                     query
                 )
         return Request(
             SEARCH_USER_URL + '?' + urlencode({
                 's_mode': 's_usr',
                 'nick': query['username'].encode('utf-8'),
             }),
             headers=self._make_headers(),
             callback=self.parse_username_recommendations,
             meta=dict(query=query),
             dont_filter=True,
         )
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being turned into a failure result.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 2
0
    def parse_list(self, response):
        """Build a ``Page`` from a list-style result table.

        Reads ``query`` and ``uri`` from ``response.meta``.  Returns a
        ``failed(...)`` result when ``busy`` matches the body, an empty
        ``Page`` (``total=0``) when ``empty`` matches it, and otherwise a
        ``Page`` whose ``arts`` come from the rows of the ``FixFrame``
        table.  Any exception raised while extracting is reported
        through ``failed`` with the formatted traceback.
        """
        query = response.meta['query']
        # Decode once; both the busy and the empty check inspect the text.
        body = response.body_as_unicode()
        if busy(body):
            return failed(query, 'tora busy', expected=True)
        uri = response.meta['uri']
        log.msg(u'got response of query %s' % query, level=log.INFO)

        if empty(body):
            log.msg('empty result', level=log.INFO)
            return Page(query=query, uri=uri, total=0, arts=[])

        def gen(trs):
            # Every second row starting at index 2, never the last row,
            # capped at MAX_ARTS entries.
            for tr in trs[2:-1:2][:MAX_ARTS]:
                href = tr.xpath('td[@class="c1"]/a/@href').extract()[0]
                # 'reserve' only when one of the c7 text nodes is exactly u'予'.
                reserved = u'予' in tr.xpath('td[@class="c7"]/text()').extract()
                yield Art(
                    title=tr.xpath('td[@class="c1"]/a/text()').extract()[0],
                    author=tr.xpath('td[@class="c2"]/a/text()').extract()[0],
                    company=tr.xpath('td[@class="c3"]/a/text()').extract()[0],
                    uri=urljoin(BASE_URL, href),
                    status='reserve' if reserved else 'other',
                )

        sel = Selector(response)
        try:
            trs = list(sel.xpath('//table[@class="FixFrame"]//tr'))
            return Page(query=query,
                        uri=uri,
                        total=total(sel),
                        arts=list(gen(trs)))
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt and SystemExit must
            # propagate instead of being reported as a parse failure.
            return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 3
0
    def parse_complex_list(self, response):
        """Yield one art request per entry of a complex result list.

        Reads ``query`` and ``uri`` from ``response.meta``.  Yields a
        single ``failed(...)`` result when ``busy`` matches the body, a
        single empty ``Page`` when ``empty`` matches it, and otherwise
        one request from ``self.make_art_request`` per extracted link
        (at most ``MAX_ARTS``).  Every request carries the shared
        ``page`` (whose ``arts`` list is pre-filled with ``None``) and
        the ``ranks`` mapping from art uri to its position in that list.
        """
        query = response.meta['query']
        # Decode once; both the busy and the empty check inspect the text.
        body = response.body_as_unicode()
        if busy(body):
            yield failed(query, 'tora busy', expected=True)
            return
        uri = response.meta['uri']
        log.msg(u'got response of query %s' % query, level=log.INFO)

        if empty(body):
            log.msg('empty result', level=log.INFO)
            yield Page(query=query, uri=uri, total=0, arts=[])
            return

        sel = Selector(response)
        try:
            hrefs = sel.xpath(
                '//tr[@class="TBLdtil"]/td[@class="noi_c2"]/a/@href'
            ).extract()[:MAX_ARTS]
            art_uris = [urljoin(BASE_URL, href) for href in hrefs]
            log.msg('got %d arts' % len(art_uris))
            page = Page(query=query,
                        uri=uri,
                        total=total_complex(sel),
                        arts=[None] * len(art_uris))
            # Distinct names keep the page's own ``uri`` from being shadowed.
            ranks = {art_uri: i for i, art_uri in enumerate(art_uris)}
            for art_uri in art_uris:
                req = self.make_art_request(art_uri)
                req.meta['page'] = page
                req.meta['ranks'] = ranks
                yield req
        except Exception:
            # A bare ``except`` here would also catch the GeneratorExit
            # raised at ``yield req`` when the generator is closed early,
            # and yielding from that handler raises RuntimeError.
            yield failed(query, traceback.format_exc(), response=response)
Exemplo n.º 4
0
    def parse_art(self, response):
        """Turn a single art page into an ``Art`` and place it in a page.

        Reads ``uri`` from ``response.meta``.  Returns a ``failed(...)``
        result when ``busy`` matches the body or when extraction raises.
        Without ``'page'`` in ``response.meta`` the art is returned
        wrapped in a one-element ``Page``.  Otherwise the art is stored
        in ``page['arts']`` at the index ``ranks[uri]`` and the page is
        returned only once ``page_complete(page)`` is true; until then
        the method returns ``None``.
        """
        uri = response.meta['uri']
        if busy(response.body_as_unicode()):
            return failed(uri, 'tora busy', expected=True)
        sel = Selector(response)
        try:
            art = Art(
                title=sel.xpath('//td[@class="td_title_bar_r1c2"]/text()').extract()[0],
                # Unlike title/company, author keeps the full list of matches.
                author=sel.xpath('//td[@class="DetailData_L"]/a[contains(@href, "author")]/text()').extract(),
                company=sel.xpath('//td[@class="CircleName"]/a[1]/text()').extract()[0],
                uri=uri,
                status=status_in_art(sel),
            )
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt and SystemExit must
            # propagate instead of being reported as a parse failure.
            return failed(uri, traceback.format_exc(), response=response)

        if 'page' not in response.meta:
            return Page(
                query=uri,
                uri=uri,
                total=1,
                arts=[art]
            )

        page = response.meta['page']
        ranks = response.meta['ranks']
        page['arts'][ranks[uri]] = art
        if page_complete(page):
            return page
        # Other arts of this page are still outstanding.
        return None
Exemplo n.º 5
0
    def parse_art(self, response):
        """Extract an ``Art`` from a detail page and file it into its page.

        ``response.meta['uri']`` identifies the art.  A ``failed(...)``
        result is returned when ``busy`` matches the body or when
        building the ``Art`` raises.  If ``response.meta`` has no
        ``'page'`` entry, a ``Page`` holding just this art is returned.
        Otherwise the art is written to ``page['arts'][ranks[uri]]`` and
        the page is returned once ``page_complete(page)`` holds; before
        that the result is ``None``.
        """
        uri = response.meta['uri']
        if busy(response.body_as_unicode()):
            return failed(uri, 'tora busy', expected=True)
        sel = Selector(response)
        try:
            art = Art(
                title=sel.xpath(
                    '//td[@class="td_title_bar_r1c2"]/text()').extract()[0],
                # author is the whole list of matches, not just the first.
                author=sel.xpath(
                    '//td[@class="DetailData_L"]/a[contains(@href, "author")]/text()'
                ).extract(),
                company=sel.xpath(
                    '//td[@class="CircleName"]/a[1]/text()').extract()[0],
                uri=uri,
                status=status_in_art(sel),
            )
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            return failed(uri, traceback.format_exc(), response=response)

        if 'page' not in response.meta:
            return Page(query=uri, uri=uri, total=1, arts=[art])

        page = response.meta['page']
        ranks = response.meta['ranks']
        page['arts'][ranks[uri]] = art
        if page_complete(page):
            return page
        # Page not complete yet: nothing to emit for this response.
        return None
Exemplo n.º 6
0
 def parse_username(self, response):
     """Decide what to request next from a username search response.

     The query comes from ``response.meta['query']``.  If the page has
     no ``_no-item`` element and exactly one
     ``user-recommendation-item``, the user id is read from the item's
     title link and the illustrations request for that user is
     returned.  Otherwise (``_no-item`` present, or more than one item)
     a ``SEARCH_USER_URL`` request handled by
     ``parse_username_recommendations`` is returned.

     A ``failed(...)`` result is returned for a 404 status, for an
     empty item list on a page without ``_no-item``, and for any
     exception raised inside the ``try``.
     """
     query = response.meta['query']
     if response.status == 404:
         return failed(query, '404')
     sel = Selector(response)
     try:
         if not sel.xpath('//div[@class="_no-item"]'):
             items = list(
                 sel.xpath('//li[@class="user-recommendation-item"]'))
             if not items:
                 return failed(query,
                               'inconsist username search result',
                               response=response)
             if len(items) == 1:
                 # The id is the text after the last '=' of the link.
                 href = items[0].xpath(
                     './/a[@class="title"]/@href').extract()[0]
                 user_id = href.split('=')[-1]
                 check_user_id(user_id)
                 return self._make_user_illustrations_uri_request(
                     USER_ILLUSTRATIONS_URL_TEMPLATE % user_id, query)
         return Request(
             SEARCH_USER_URL + '?' +
             urlencode({
                 's_mode': 's_usr',
                 'nick': query['username'].encode('utf-8'),
             }),
             headers=self._make_headers(),
             callback=self.parse_username_recommendations,
             meta=dict(query=query),
             dont_filter=True,
         )
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 7
0
 def parse_tags(self, response):
     """Decode the response body as JSON and wrap it in ``Tags``.

     Returns ``Tags(query=..., content=...)`` on success.  A
     ``ValueError`` (what ``json.loads`` raises for undecodable input)
     is reported as the expected 'yande.re busy' failure; any other
     exception is reported with its traceback.
     """
     query = response.meta['query']
     try:
         content = json.loads(response.body_as_unicode())
         return Tags(query=query, content=content)
     except ValueError:
         return failed(query, 'yande.re busy', expected=True)
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being reported as a failure.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 8
0
 def parse_tags(self, response):
     """Return the JSON body of ``response`` as a ``Tags`` result.

     ``response.meta['query']`` is attached to the result.  If decoding
     raises ``ValueError`` the expected 'yande.re busy' failure is
     returned; every other exception yields a failure carrying the
     formatted traceback.
     """
     query = response.meta['query']
     try:
         return Tags(
             query=query,
             content=json.loads(response.body_as_unicode())
         )
     except ValueError:
         return failed(query, 'yande.re busy', expected=True)
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 9
0
 def parse_username_recommendations(self, response):
     """Collect the recommendations of a user search page.

     Returns a ``SearchUserPage`` with ``total=0``, no arts and the
     items produced by ``gen_recommendations(sel)``.  A 404 status or
     any exception while collecting is reported through ``failed``.
     """
     query = response.meta['query']
     if response.status == 404:
         return failed(query, '404')
     sel = Selector(response)
     try:
         recommendations = list(gen_recommendations(sel))
         return SearchUserPage(query=query,
                               uri=response.url,
                               total=0,
                               arts=[],
                               recommendations=recommendations)
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being reported as a failure.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 10
0
 def parse_username_recommendations(self, response):
     """Build a ``SearchUserPage`` listing recommended users.

     The page always has ``total=0`` and an empty ``arts`` list; its
     ``recommendations`` are whatever ``gen_recommendations(sel)``
     produces.  Returns a ``failed(...)`` result for a 404 status or
     when building the page raises.
     """
     query = response.meta['query']
     if response.status == 404:
         return failed(query, '404')
     sel = Selector(response)
     try:
         return SearchUserPage(
             query=query,
             uri=response.url,
             total=0,
             arts=[],
             recommendations=list(gen_recommendations(sel))
         )
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 11
0
 def parse_gist(self, response):
     """Yield one file request per file listed in a gist's JSON.

     The decoded body becomes the ``meta`` of a new ``Gist`` whose
     ``files`` list starts empty.  If the JSON has no ``'files'`` key a
     single failure carrying the raw body is yielded; otherwise
     ``self.make_file_request(name, info, gist)`` is yielded for every
     entry.  Any exception is reported as a failure with its traceback.
     """
     query = response.meta['query']
     try:
         # Decode once; the text is needed again if 'files' is missing.
         body = response.body_as_unicode()
         gist = Gist(uri=response.url,
                     query=query,
                     meta=json.loads(body),
                     files=[])
         files = gist['meta'].get('files')
         if files is None:
             yield failed(query, body)
             return
         for name, info in files.items():
             yield self.make_file_request(name, info, gist)
     except Exception:
         # A bare ``except`` here would also catch the GeneratorExit
         # raised at a ``yield`` when the generator is closed early, and
         # yielding from that handler raises RuntimeError.
         yield failed(query, traceback.format_exc(), response=response)
Exemplo n.º 12
0
 def parse_uri(self, response):
     """Parse the response body as a feed and wrap it in a ``Feed``.

     If feedparser flags the document (``feed.bozo``), a failure naming
     the line number and message of ``feed.bozo_exception`` is
     returned.  Otherwise the parsed feed is round-tripped through
     ``jsonpickle.encode`` and ``json.loads`` and stored as ``data``.
     Any exception is reported through ``failed`` with its traceback.
     """
     query = response.meta['query']
     try:
         feed = feedparser.parse(response.body)
         if feed.bozo:
             # NOTE(review): assumes bozo_exception offers
             # getLineNumber()/getMessage(); if it does not, the
             # AttributeError is reported by the handler below.
             error = feed.bozo_exception
             message = 'ill formed xml on line {}: {}'.format(
                 error.getLineNumber(), error.getMessage())
             return failed(query, message, response=response)
         return Feed(uri=response.url,
                     query=query,
                     data=json.loads(jsonpickle.encode(feed)))
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being reported as a failure.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 13
0
 def parse_uri(self, response):
     """Return the feed found in ``response.body`` as a ``Feed`` result.

     A document flagged by feedparser (``feed.bozo``) produces a
     failure whose message contains the line number and message taken
     from ``feed.bozo_exception``.  A clean document is converted with
     ``jsonpickle.encode`` followed by ``json.loads`` and returned as
     ``Feed.data``.  Every exception ends up as a failure carrying the
     formatted traceback.
     """
     query = response.meta['query']
     try:
         feed = feedparser.parse(response.body)
         if feed.bozo:
             # NOTE(review): getLineNumber()/getMessage() are assumed to
             # exist on bozo_exception; a missing method surfaces as an
             # AttributeError handled by the clause below.
             return failed(query, 'ill formed xml on line {}: {}'.format(
                 feed.bozo_exception.getLineNumber(),
                 feed.bozo_exception.getMessage()
             ), response=response)
         return Feed(
             uri=response.url,
             query=query,
             data=json.loads(jsonpickle.encode(feed))
         )
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 14
0
 def parse_gist(self, response):
     """Fan a gist's JSON description out into per-file requests.

     A ``Gist`` is created from the decoded body with an empty
     ``files`` list.  When the decoded JSON lacks a ``'files'`` key,
     one failure containing the raw body is yielded and the generator
     stops.  Otherwise one ``self.make_file_request(name, info, gist)``
     is yielded per file.  Exceptions are reported as a failure with
     the formatted traceback.
     """
     query = response.meta['query']
     try:
         # Decoded once and reused for the "no files" failure message.
         body = response.body_as_unicode()
         gist = Gist(
             uri=response.url,
             query=query,
             meta=json.loads(body),
             files=[]
         )
         files = gist['meta'].get('files')
         if files is None:
             yield failed(query, body)
             return
         for name, info in files.items():
             yield self.make_file_request(name, info, gist)
     except Exception:
         # Must not be a bare ``except``: closing the generator raises
         # GeneratorExit at a ``yield``, and yielding again from the
         # handler would raise RuntimeError.
         yield failed(query, traceback.format_exc(), response=response)
Exemplo n.º 15
0
 def parse_file(self, response):
     """Attach a downloaded file to its gist.

     Appends a ``File`` (name from ``response.meta['name']``, content
     base64-encoded from ``response.body``) to ``gist['files']``.
     Returns the gist once it holds as many files as
     ``gist['meta']['files']`` lists, ``None`` while files are still
     missing, and a ``failed(...)`` result if anything raises.
     """
     gist = response.meta['gist']
     try:
         gist['files'].append(File(
             name=response.meta['name'],
             content=base64.b64encode(response.body)
         ))
         if len(gist['files']) == len(gist['meta']['files']):
             return gist
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being reported as a failure.
         return failed(gist['query'], traceback.format_exc(), response=response)
     # More files of this gist are still outstanding.
     return None
Exemplo n.º 16
0
 def parse_posts(self, response):
     """Wrap the JSON body of ``response`` in a ``Posts`` result.

     ``query`` and ``uri`` are taken from ``response.meta``.  A
     ``ValueError`` is reported as the expected 'yande.re busy'
     failure; any other ``Exception`` is passed, as a string, to
     ``self.failed``.
     """
     meta = response.meta
     query = meta['query']
     uri = meta['uri']
     try:
         decoded = json.loads(response.body_as_unicode())
         return Posts(query=query, uri=uri, posts=decoded)
     except ValueError:
         return failed(query, 'yande.re busy', expected=True)
     except Exception as err:
         # NOTE(review): this branch calls ``self.failed`` while the one
         # above calls the bare ``failed`` -- confirm the method exists.
         return self.failed(query, str(err))
Exemplo n.º 17
0
 def parse_file(self, response):
     """Record one fetched file on the gist it belongs to.

     A ``File`` built from ``response.meta['name']`` and the
     base64-encoded ``response.body`` is appended to ``gist['files']``.
     The gist is returned when its file count equals the number of
     entries in ``gist['meta']['files']``; until then the result is
     ``None``.  Exceptions produce a ``failed(...)`` result.
     """
     gist = response.meta['gist']
     try:
         gist['files'].append(
             File(name=response.meta['name'],
                  content=base64.b64encode(response.body)))
         if len(gist['files']) == len(gist['meta']['files']):
             return gist
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return failed(gist['query'],
                       traceback.format_exc(),
                       response=response)
     # Still waiting for the remaining files.
     return None
Exemplo n.º 18
0
 def parse_ranking_uri(self, response):
     """Store one ranking page and emit the ranking once all are in.

     The decoded JSON's ``'contents'`` (or ``[]`` when the JSON has an
     ``'error'`` key) is written to ``pages`` at the index
     ``response.meta['page']``.  When no entry of ``pages`` is ``None``
     any more, all entries are concatenated into a single ``Page``.
     Returns ``None`` while entries are missing, and a ``failed(...)``
     result for a 404 status, undecodable JSON ('pixiv busy') or any
     other exception.
     """
     query = response.meta['query']
     if response.status == 404:
         return failed(query, '404')
     try:
         pages = response.meta['pages']
         try:
             d = json.loads(response.body_as_unicode())
         except ValueError:
             return failed(query, 'pixiv busy', expected=True)
         pages[response.meta['page']] = [] if 'error' in d else d['contents']
         if None not in pages:
             arts = list(chain(*pages))
             return Page(
                 query=query,
                 uri=make_ranking_uri(query),
                 total=len(arts),
                 arts=arts,
             )
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being reported as a failure.
         return failed(query, traceback.format_exc(), response=response)
     # Some ranking pages have not arrived yet.
     return None
Exemplo n.º 19
0
    def parse_user_illustrations_uri(self, response):
        """Build a ``Page`` of a user's illustrations.

        ``total`` is the first run of digits in the ``count-badge`` span
        under ``#wrapper``; ``arts`` are the first ``self.max_arts``
        items of ``parse_user_arts(sel)``.  Returns a ``failed(...)``
        result for a 404 status (marked expected), when the badge
        reports a positive total but no arts were found, or when
        extraction raises.
        """
        query = response.meta['query']
        if response.status == 404:
            return failed(query, '404', expected=True)
        uri = response.meta['uri']
        log.msg(u'got response of query %s' % uri)

        sel = Selector(response)
        try:
            badge = sel.xpath('//*[@id="wrapper"]//span[@class="count-badge"]/text()')
            total = int(badge.re(r'\d+')[0])
            arts = list(parse_user_arts(sel))[:self.max_arts]
            if not arts and total > 0:
                return failed(query, 'data inconsist', response=response)
            return Page(
                query=query,
                uri=uri,
                total=total,
                arts=arts,
            )
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt and SystemExit must
            # propagate instead of being reported as a failure.
            return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 20
0
 def parse(self, response):
     """Package the raw response into a ``Response`` result.

     ``data`` holds the status, the headers as a plain dict and the
     base64-encoded body; a truthy ``response.meta['payload']`` is
     copied in under ``'payload'``.  Exceptions are reported through
     ``failed`` with the formatted traceback.
     """
     query = response.meta['query']
     try:
         data = dict(status=response.status,
                     headers=dict(response.headers),
                     body=base64.b64encode(response.body))
         payload = response.meta.get('payload')
         if payload:
             data['payload'] = payload
         return Response(data=data, query=query)
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being reported as a failure.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 21
0
 def parse(self, response):
     """Build a ``Page`` with one post per matching table row.

     Rows are the ``tr`` children of the ``itg`` table whose class
     starts with ``gtr``; each is converted by ``make_post``.
     Exceptions are reported through ``failed`` with the formatted
     traceback.
     """
     query = response.meta['query']
     try:
         sel = Selector(response)
         rows = sel.xpath(
             '//table[@class="itg"]/tr[starts-with(@class, "gtr")]')
         return Page(
             uri=response.url,
             query=query,
             posts=[make_post(sub) for sub in rows])
     except Exception:
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being reported as a failure.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 22
0
 def parse(self, response):
     """Convert the rows of the ``itg`` table into a ``Page`` of posts.

     Every ``tr`` directly under the table whose class starts with
     ``gtr`` is passed to ``make_post``.  If anything raises, a
     ``failed(...)`` result carrying the traceback is returned.
     """
     query = response.meta['query']
     try:
         sel = Selector(response)
         return Page(
             uri=response.url,
             query=query,
             posts=[make_post(sub) for sub in sel.xpath(
                 '//table[@class="itg"]/tr[starts-with(@class, "gtr")]'
             )]
         )
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 23
0
 def parse_ranking_uri(self, response):
     """Record one ranking page; return the full ranking when complete.

     ``pages[response.meta['page']]`` receives the decoded JSON's
     ``'contents'``, or ``[]`` if the JSON contains ``'error'``.  Once
     ``pages`` has no ``None`` entry left, its entries are chained into
     one ``Page``.  The result is ``None`` while entries are missing.
     A 404 status, undecodable JSON ('pixiv busy') and any other
     exception each produce a ``failed(...)`` result.
     """
     query = response.meta['query']
     if response.status == 404:
         return failed(query, '404')
     try:
         pages = response.meta['pages']
         try:
             d = json.loads(response.body_as_unicode())
         except ValueError:
             return failed(query, 'pixiv busy', expected=True)
         contents = [] if 'error' in d else d['contents']
         pages[response.meta['page']] = contents
         if None not in pages:
             arts = list(chain(*pages))
             return Page(
                 query=query,
                 uri=make_ranking_uri(query),
                 total=len(arts),
                 arts=arts,
             )
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return failed(query, traceback.format_exc(), response=response)
     # Not every ranking page has been stored yet.
     return None
Exemplo n.º 24
0
 def parse_posts(self, response):
     """Return the decoded JSON body of ``response`` as ``Posts``.

     The result carries ``query`` and ``uri`` from ``response.meta``.
     ``ValueError`` maps to the expected 'yande.re busy' failure; other
     exceptions are stringified and handed to ``self.failed``.
     """
     query, uri = response.meta['query'], response.meta['uri']
     try:
         posts = json.loads(response.body_as_unicode())
         result = Posts(query=query, uri=uri, posts=posts)
     except ValueError:
         result = failed(query, 'yande.re busy', expected=True)
     except Exception as exc:
         # NOTE(review): ``self.failed`` here versus plain ``failed``
         # above -- verify that the method is really defined.
         result = self.failed(query, str(exc))
     return result
Exemplo n.º 25
0
def checkbusy(f):
    """Decorate a response callback so "busy" bodies short-circuit.

    A response counts as busy when the MD5 hex digest of its body is
    listed in ``BUSY_BODY_MD5_LIST``.  For such a response the wrapper
    produces ``failed(response.meta['query'], 'pixiv busy',
    expected=True)`` instead of calling ``f``; otherwise it delegates to
    ``f(self, response)``.  Generator callbacks get a generator wrapper,
    plain callbacks a plain one.

    The wrapper keeps ``f``'s name and docstring.
    """
    # Function-local so this block does not rely on the file's imports.
    import functools

    def is_busy(response):
        return md5(response.body).hexdigest() in BUSY_BODY_MD5_LIST

    def make_failure(response):
        return failed(response.meta['query'], 'pixiv busy', expected=True)

    if inspect.isgeneratorfunction(f):
        @functools.wraps(f)
        def inner(self, response):
            if is_busy(response):
                # The response must be passed on: the failure is built
                # from response.meta['query'].
                yield make_failure(response)
                return
            for ret in f(self, response):
                yield ret
    else:
        @functools.wraps(f)
        def inner(self, response):
            if is_busy(response):
                return make_failure(response)
            return f(self, response)
    return inner
Exemplo n.º 26
0
    def parse_list(self, response):
        """Turn a list-style result table into a ``Page``.

        ``query`` and ``uri`` come from ``response.meta``.  A body
        matched by ``busy`` gives a ``failed(...)`` result, a body
        matched by ``empty`` gives a ``Page`` with ``total=0``, and any
        other body gives a ``Page`` whose ``arts`` are read from the
        rows of the ``FixFrame`` table.  Exceptions during extraction
        are reported through ``failed`` with the formatted traceback.
        """
        query = response.meta['query']
        # Decode once; the text feeds both the busy and the empty check.
        body = response.body_as_unicode()
        if busy(body):
            return failed(query, 'tora busy', expected=True)
        uri = response.meta['uri']
        log.msg(u'got response of query %s' % query, level=log.INFO)

        if empty(body):
            log.msg('empty result', level=log.INFO)
            return Page(
                query=query,
                uri=uri,
                total=0,
                arts=[]
            )

        def gen(trs):
            # Rows 2, 4, 6, ... up to but excluding the last row, and no
            # more than MAX_ARTS of them.
            for tr in trs[2:-1:2][:MAX_ARTS]:
                href = tr.xpath('td[@class="c1"]/a/@href').extract()[0]
                # 'reserve' only if a c7 text node is exactly u'予'.
                reserved = u'予' in tr.xpath('td[@class="c7"]/text()').extract()
                yield Art(
                    title=tr.xpath('td[@class="c1"]/a/text()').extract()[0],
                    author=tr.xpath('td[@class="c2"]/a/text()').extract()[0],
                    company=tr.xpath('td[@class="c3"]/a/text()').extract()[0],
                    uri=urljoin(BASE_URL, href),
                    status='reserve' if reserved else 'other',
                )

        sel = Selector(response)
        try:
            trs = list(sel.xpath('//table[@class="FixFrame"]//tr'))
            return Page(
                query=query,
                uri=uri,
                total=total(sel),
                arts=list(gen(trs))
            )
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 27
0
    def parse_complex_list(self, response):
        """Yield an art request for each link of a complex result list.

        ``query`` and ``uri`` come from ``response.meta``.  A body
        matched by ``busy`` yields one ``failed(...)`` result, a body
        matched by ``empty`` yields one empty ``Page``.  Otherwise up to
        ``MAX_ARTS`` links are extracted and one request from
        ``self.make_art_request`` is yielded per link; each request's
        meta carries the shared ``page`` (``arts`` pre-filled with
        ``None``) and ``ranks``, which maps an art uri to its index in
        ``arts``.
        """
        query = response.meta['query']
        # Decode once; the text feeds both the busy and the empty check.
        body = response.body_as_unicode()
        if busy(body):
            yield failed(query, 'tora busy', expected=True)
            return
        uri = response.meta['uri']
        log.msg(u'got response of query %s' % query, level=log.INFO)

        if empty(body):
            log.msg('empty result', level=log.INFO)
            yield Page(
                query=query,
                uri=uri,
                total=0,
                arts=[]
            )
            return

        sel = Selector(response)
        try:
            hrefs = sel.xpath('//tr[@class="TBLdtil"]/td[@class="noi_c2"]/a/@href').extract()[:MAX_ARTS]
            art_uris = [urljoin(BASE_URL, href) for href in hrefs]
            log.msg('got %d arts' % len(art_uris))
            page = Page(
                query=query,
                uri=uri,
                total=total_complex(sel),
                arts=[None] * len(art_uris)
            )
            # Separate names so the page's own ``uri`` is never rebound.
            ranks = {art_uri: i for i, art_uri in enumerate(art_uris)}
            for art_uri in art_uris:
                req = self.make_art_request(art_uri)
                req.meta['page'] = page
                req.meta['ranks'] = ranks
                yield req
        except Exception:
            # Must not be a bare ``except``: closing the generator raises
            # GeneratorExit at ``yield req``, and yielding again from the
            # handler would raise RuntimeError.
            yield failed(query, traceback.format_exc(), response=response)
Exemplo n.º 28
0
    def parse_user_illustrations_uri(self, response):
        """Return a ``Page`` describing a user's illustrations.

        The total is parsed from the first digit run of the
        ``count-badge`` span inside ``#wrapper``; the arts are the
        leading ``self.max_arts`` results of ``parse_user_arts(sel)``.
        A ``failed(...)`` result is returned for a 404 status (marked
        expected), for a positive total with an empty art list, and for
        any exception raised during extraction.
        """
        query = response.meta['query']
        if response.status == 404:
            return failed(query, '404', expected=True)
        uri = response.meta['uri']
        log.msg(u'got response of query %s' % uri)

        sel = Selector(response)
        try:
            badge = sel.xpath(
                '//*[@id="wrapper"]//span[@class="count-badge"]/text()')
            total = int(badge.re(r'\d+')[0])
            arts = list(parse_user_arts(sel))[:self.max_arts]
            if not arts and total > 0:
                return failed(query, 'data inconsist', response=response)
            return Page(
                query=query,
                uri=uri,
                total=total,
                arts=arts,
            )
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 29
0
 def parse(self, response):
     """Return the raw response wrapped in a ``Response`` result.

     The result's ``data`` contains ``status``, ``headers`` (converted
     to a plain dict) and ``body`` (base64-encoded).  When
     ``response.meta`` has a truthy ``'payload'`` it is added to
     ``data`` as well.  If anything raises, a ``failed(...)`` result
     carrying the traceback is returned.
     """
     query = response.meta['query']
     try:
         data = dict(
             status=response.status,
             headers=dict(response.headers),
             body=base64.b64encode(response.body)
         )
         payload = response.meta.get('payload')
         if payload:
             data['payload'] = payload
         return Response(
             data=data,
             query=query
         )
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return failed(query, traceback.format_exc(), response=response)
Exemplo n.º 30
0
def checkbusy(f):
    """Wrap a response callback with a "busy body" guard.

    The guard hashes ``response.body`` with MD5 and looks the hex digest
    up in ``BUSY_BODY_MD5_LIST``.  On a hit the wrapper produces
    ``failed(response.meta['query'], 'pixiv busy', expected=True)``
    without calling ``f``; on a miss it delegates to
    ``f(self, response)``.  If ``f`` is a generator function the wrapper
    is one too, so the failure is yielded rather than returned.

    ``f``'s name and docstring are copied onto the wrapper.
    """
    # Function-local so this block does not rely on the file's imports.
    import functools

    def check(response):
        return md5(response.body).hexdigest() in BUSY_BODY_MD5_LIST

    def make(response):
        return failed(response.meta['query'], 'pixiv busy', expected=True)

    if inspect.isgeneratorfunction(f):

        @functools.wraps(f)
        def inner(self, response):
            if check(response):
                # ``make`` needs the response to read meta['query'].
                yield make(response)
                return
            for ret in f(self, response):
                yield ret
    else:

        @functools.wraps(f)
        def inner(self, response):
            return make(response) if check(response) else f(self, response)

    return inner