def parse_username(self, response):
    """Resolve a username search page.

    Exactly one recommendation item means an exact match: issue the
    user-illustrations request for that user id. Otherwise fall back to a
    broader user-search request handled by ``parse_username_recommendations``.
    Returns a ``failed`` item on 404, inconsistent results, or any error.
    """
    query = response.meta['query']
    if response.status == 404:
        return failed(query, '404')
    sel = Selector(response)
    try:
        # A "_no-item" div marks an empty result page; only inspect
        # recommendation items when it is absent.
        if not sel.xpath('//div[@class="_no-item"]'):
            items = list(
                sel.xpath('//li[@class="user-recommendation-item"]'))
            if len(items) == 0:
                return failed(query, 'inconsist username search result',
                              response=response)
            if len(items) == 1:
                # Exact match: the user id is the trailing query-string
                # value of the profile link.
                user_id = items[0].xpath(
                    './/a[@class="title"]/@href').extract()[0].split('=')[-1]
                check_user_id(user_id)
                return self._make_user_illustrations_uri_request(
                    USER_ILLUSTRATIONS_URL_TEMPLATE % user_id, query)
        # Empty or ambiguous result: query the user search endpoint.
        return Request(
            SEARCH_USER_URL + '?' + urlencode({
                's_mode': 's_usr',
                'nick': query['username'].encode('utf-8'),
            }),
            headers=self._make_headers(),
            callback=self.parse_username_recommendations,
            meta=dict(query=query),
            dont_filter=True,
        )
    except Exception:
        # Narrowed from bare ``except`` so KeyboardInterrupt/SystemExit
        # are not swallowed into a failed item.
        return failed(query, traceback.format_exc(), response=response)
def parse_list(self, response):
    """Parse a toranoana search-result table into a ``Page`` item.

    Returns a ``failed`` item when the site reports busy or parsing
    raises; returns an empty ``Page`` for an empty result page.
    """
    query = response.meta['query']
    if busy(response.body_as_unicode()):
        return failed(query, 'tora busy', expected=True)
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % query, level=log.INFO)
    if empty(response.body_as_unicode()):
        log.msg('empty result', level=log.INFO)
        return Page(query=query, uri=uri, total=0, arts=[])

    def gen(trs):
        # Rows [2:-1:2]: data rows start at index 2, alternate with
        # spacer rows, and the last row is a footer.
        for tr in trs[2:-1:2][:MAX_ARTS]:
            yield Art(
                title=tr.xpath('td[@class="c1"]/a/text()').extract()[0],
                author=tr.xpath('td[@class="c2"]/a/text()').extract()[0],
                company=tr.xpath('td[@class="c3"]/a/text()').extract()[0],
                uri=urljoin(
                    BASE_URL,
                    tr.xpath('td[@class="c1"]/a/@href').extract()[0]),
                # u'予' in the c7 cell marks a reservation.
                status='reserve' if u'予' in tr.xpath(
                    'td[@class="c7"]/text()').extract() else 'other',
            )

    sel = Selector(response)
    try:
        trs = list(sel.xpath('//table[@class="FixFrame"]//tr'))
        return Page(query=query, uri=uri, total=total(sel),
                    arts=list(gen(trs)))
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_complex_list(self, response):
    """Parse a "complex" toranoana listing.

    Builds a single shared ``Page`` with a pre-sized ``arts`` slot list,
    then yields one detail-page request per art; ``parse_art`` fills the
    slots via the shared ``ranks`` map. Yields ``failed`` on busy/error
    and an empty ``Page`` on an empty result.
    """
    query = response.meta['query']
    if busy(response.body_as_unicode()):
        yield failed(query, 'tora busy', expected=True)
        return
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % query, level=log.INFO)
    if empty(response.body_as_unicode()):
        log.msg('empty result', level=log.INFO)
        yield Page(query=query, uri=uri, total=0, arts=[])
        return
    sel = Selector(response)
    try:
        uris = [
            urljoin(BASE_URL, href) for href in sel.xpath(
                '//tr[@class="TBLdtil"]/td[@class="noi_c2"]/a/@href'
            ).extract()[:MAX_ARTS]
        ]
        log.msg('got %d arts' % len(uris))
        page = Page(query=query, uri=uri, total=total_complex(sel),
                    arts=[None] * len(uris))
        ranks = {art_uri: i for i, art_uri in enumerate(uris)}
        # Loop variable renamed from ``uri`` (was shadowing the page uri).
        for art_uri in uris:
            req = self.make_art_request(art_uri)
            req.meta['page'] = page
            req.meta['ranks'] = ranks
            yield req
    except Exception:
        # Narrowed from bare ``except``.
        yield failed(query, traceback.format_exc(), response=response)
def parse_art(self, response):
    """Parse an art detail page into an ``Art``.

    When scheduled by ``parse_complex_list`` (``page`` present in meta)
    the art is stored into its slot of the shared ``Page``, which is
    returned only once every slot is filled; otherwise a single-art
    ``Page`` is returned directly.
    """
    uri = response.meta['uri']
    if busy(response.body_as_unicode()):
        return failed(uri, 'tora busy', expected=True)
    sel = Selector(response)
    try:
        art = Art(
            title=sel.xpath(
                '//td[@class="td_title_bar_r1c2"]/text()').extract()[0],
            # NOTE(review): author keeps the whole extracted list
            # (possibly several authors) unlike title/company which take
            # [0] — confirm this asymmetry is intended.
            author=sel.xpath(
                '//td[@class="DetailData_L"]'
                '/a[contains(@href, "author")]/text()').extract(),
            company=sel.xpath(
                '//td[@class="CircleName"]/a[1]/text()').extract()[0],
            uri=uri,
            status=status_in_art(sel),
        )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(uri, traceback.format_exc(), response=response)
    if 'page' not in response.meta:
        return Page(query=uri, uri=uri, total=1, arts=[art])
    page = response.meta['page']
    ranks = response.meta['ranks']
    page['arts'][ranks[uri]] = art
    if page_complete(page):
        # Emit the shared page only when the last slot has been filled.
        return page
def parse_art(self, response):
    """Parse an art detail page into an ``Art``.

    When scheduled by ``parse_complex_list`` (``page`` present in meta)
    the art is stored into its slot of the shared ``Page``, which is
    returned only once every slot is filled; otherwise a single-art
    ``Page`` is returned directly.
    """
    uri = response.meta['uri']
    if busy(response.body_as_unicode()):
        return failed(uri, 'tora busy', expected=True)
    sel = Selector(response)
    try:
        art = Art(
            title=sel.xpath(
                '//td[@class="td_title_bar_r1c2"]/text()').extract()[0],
            # NOTE(review): author keeps the whole extracted list
            # (possibly several authors) unlike title/company which take
            # [0] — confirm this asymmetry is intended.
            author=sel.xpath(
                '//td[@class="DetailData_L"]'
                '/a[contains(@href, "author")]/text()').extract(),
            company=sel.xpath(
                '//td[@class="CircleName"]/a[1]/text()').extract()[0],
            uri=uri,
            status=status_in_art(sel),
        )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(uri, traceback.format_exc(), response=response)
    if 'page' not in response.meta:
        return Page(query=uri, uri=uri, total=1, arts=[art])
    page = response.meta['page']
    ranks = response.meta['ranks']
    page['arts'][ranks[uri]] = art
    if page_complete(page):
        # Emit the shared page only when the last slot has been filled.
        return page
def parse_username(self, response):
    """Resolve a username search page.

    Exactly one recommendation item means an exact match: issue the
    user-illustrations request for that user id. Otherwise fall back to a
    broader user-search request handled by ``parse_username_recommendations``.
    Returns a ``failed`` item on 404, inconsistent results, or any error.
    """
    query = response.meta['query']
    if response.status == 404:
        return failed(query, '404')
    sel = Selector(response)
    try:
        # A "_no-item" div marks an empty result page; only inspect
        # recommendation items when it is absent.
        if not sel.xpath('//div[@class="_no-item"]'):
            items = list(
                sel.xpath('//li[@class="user-recommendation-item"]'))
            if len(items) == 0:
                return failed(query, 'inconsist username search result',
                              response=response)
            if len(items) == 1:
                # Exact match: the user id is the trailing query-string
                # value of the profile link.
                user_id = items[0].xpath(
                    './/a[@class="title"]/@href').extract()[0].split('=')[-1]
                check_user_id(user_id)
                return self._make_user_illustrations_uri_request(
                    USER_ILLUSTRATIONS_URL_TEMPLATE % user_id, query)
        # Empty or ambiguous result: query the user search endpoint.
        return Request(
            SEARCH_USER_URL + '?' + urlencode({
                's_mode': 's_usr',
                'nick': query['username'].encode('utf-8'),
            }),
            headers=self._make_headers(),
            callback=self.parse_username_recommendations,
            meta=dict(query=query),
            dont_filter=True,
        )
    except Exception:
        # Narrowed from bare ``except`` so KeyboardInterrupt/SystemExit
        # are not swallowed into a failed item.
        return failed(query, traceback.format_exc(), response=response)
def parse_tags(self, response):
    """Parse a yande.re tag JSON response into a ``Tags`` item.

    A JSON decode error is treated as the site being busy (expected
    failure); any other error returns a traceback-carrying ``failed``.
    """
    query = response.meta['query']
    try:
        return Tags(query=query,
                    content=json.loads(response.body_as_unicode()))
    except ValueError:
        # Non-JSON body: the site is serving its busy page.
        return failed(query, 'yande.re busy', expected=True)
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_tags(self, response):
    """Parse a yande.re tag JSON response into a ``Tags`` item.

    A JSON decode error is treated as the site being busy (expected
    failure); any other error returns a traceback-carrying ``failed``.
    """
    query = response.meta['query']
    try:
        return Tags(query=query,
                    content=json.loads(response.body_as_unicode()))
    except ValueError:
        # Non-JSON body: the site is serving its busy page.
        return failed(query, 'yande.re busy', expected=True)
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_username_recommendations(self, response):
    """Build a ``SearchUserPage`` of recommended users from a user-search
    result page; ``failed`` on 404 or any parsing error.
    """
    query = response.meta['query']
    if response.status == 404:
        return failed(query, '404')
    sel = Selector(response)
    try:
        return SearchUserPage(
            query=query,
            uri=response.url,
            total=0,
            arts=[],
            recommendations=list(gen_recommendations(sel)),
        )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_username_recommendations(self, response):
    """Build a ``SearchUserPage`` of recommended users from a user-search
    result page; ``failed`` on 404 or any parsing error.
    """
    query = response.meta['query']
    if response.status == 404:
        return failed(query, '404')
    sel = Selector(response)
    try:
        return SearchUserPage(
            query=query,
            uri=response.url,
            total=0,
            arts=[],
            recommendations=list(gen_recommendations(sel)),
        )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_gist(self, response):
    """Parse gist metadata JSON and schedule one request per file.

    Yields a ``failed`` item when the metadata has no ``files`` mapping
    or any error is raised; otherwise yields one file request per entry,
    all sharing the ``gist`` accumulator.
    """
    query = response.meta['query']
    try:
        gist = Gist(uri=response.url, query=query,
                    meta=json.loads(response.body_as_unicode()), files=[])
        files = gist['meta'].get('files')
        if files is None:
            # Unexpected payload shape: report the raw body.
            yield failed(query, response.body_as_unicode())
            return
        for name, info in files.items():
            yield self.make_file_request(name, info, gist)
    except Exception:
        # Narrowed from bare ``except``.
        yield failed(query, traceback.format_exc(), response=response)
def parse_uri(self, response):
    """Parse an RSS/Atom body into a ``Feed`` item.

    Ill-formed XML (feedparser's ``bozo`` flag) is reported as a
    ``failed`` item carrying the parser's line number and message.
    """
    query = response.meta['query']
    try:
        feed = feedparser.parse(response.body)
        if feed.bozo:
            return failed(
                query,
                'ill formed xml on line {}: {}'.format(
                    feed.bozo_exception.getLineNumber(),
                    feed.bozo_exception.getMessage()),
                response=response)
        # jsonpickle round-trip flattens FeedParserDict into plain JSON data.
        return Feed(uri=response.url, query=query,
                    data=json.loads(jsonpickle.encode(feed)))
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_uri(self, response):
    """Parse an RSS/Atom body into a ``Feed`` item.

    Ill-formed XML (feedparser's ``bozo`` flag) is reported as a
    ``failed`` item carrying the parser's line number and message.
    """
    query = response.meta['query']
    try:
        feed = feedparser.parse(response.body)
        if feed.bozo:
            return failed(
                query,
                'ill formed xml on line {}: {}'.format(
                    feed.bozo_exception.getLineNumber(),
                    feed.bozo_exception.getMessage()),
                response=response)
        # jsonpickle round-trip flattens FeedParserDict into plain JSON data.
        return Feed(uri=response.url, query=query,
                    data=json.loads(jsonpickle.encode(feed)))
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_gist(self, response):
    """Parse gist metadata JSON and schedule one request per file.

    Yields a ``failed`` item when the metadata has no ``files`` mapping
    or any error is raised; otherwise yields one file request per entry,
    all sharing the ``gist`` accumulator.
    """
    query = response.meta['query']
    try:
        gist = Gist(uri=response.url, query=query,
                    meta=json.loads(response.body_as_unicode()), files=[])
        files = gist['meta'].get('files')
        if files is None:
            # Unexpected payload shape: report the raw body.
            yield failed(query, response.body_as_unicode())
            return
        for name, info in files.items():
            yield self.make_file_request(name, info, gist)
    except Exception:
        # Narrowed from bare ``except``.
        yield failed(query, traceback.format_exc(), response=response)
def parse_file(self, response):
    """Append one downloaded file to its gist accumulator; return the
    gist once all of its files have arrived.
    """
    gist = response.meta['gist']
    try:
        # base64 keeps arbitrary binary file bodies serialization-safe.
        gist['files'].append(File(name=response.meta['name'],
                                  content=base64.b64encode(response.body)))
        if len(gist['files']) == len(gist['meta']['files']):
            # Last outstanding file: emit the completed gist.
            return gist
    except Exception:
        # Narrowed from bare ``except``.
        return failed(gist['query'], traceback.format_exc(),
                      response=response)
def parse_posts(self, response):
    """Parse a yande.re posts JSON response into a ``Posts`` item.

    A JSON decode error is treated as the site being busy (expected);
    any other error returns a traceback-carrying ``failed`` item.
    """
    query = response.meta['query']
    uri = response.meta['uri']
    try:
        return Posts(query=query, uri=uri,
                     posts=json.loads(response.body_as_unicode()))
    except ValueError:
        # Non-JSON body: the site is serving its busy page.
        return failed(query, 'yande.re busy', expected=True)
    except Exception:
        # Was ``self.failed(query, str(e))`` — every other handler in
        # this file uses the module-level ``failed`` with the full
        # traceback and the response attached; made consistent.
        return failed(query, traceback.format_exc(), response=response)
def parse_file(self, response):
    """Append one downloaded file to its gist accumulator; return the
    gist once all of its files have arrived.
    """
    gist = response.meta['gist']
    try:
        # base64 keeps arbitrary binary file bodies serialization-safe.
        gist['files'].append(File(name=response.meta['name'],
                                  content=base64.b64encode(response.body)))
        if len(gist['files']) == len(gist['meta']['files']):
            # Last outstanding file: emit the completed gist.
            return gist
    except Exception:
        # Narrowed from bare ``except``.
        return failed(gist['query'], traceback.format_exc(),
                      response=response)
def parse_ranking_uri(self, response):
    """Accumulate one ranking JSON page into ``meta['pages']``; emit the
    complete ``Page`` once every slot has been filled.

    A non-JSON body is reported as pixiv being busy (expected); an
    'error' payload contributes an empty slot rather than failing.
    """
    query = response.meta['query']
    if response.status == 404:
        return failed(query, '404')
    try:
        pages = response.meta['pages']
        try:
            d = json.loads(response.body_as_unicode())
        except ValueError:
            return failed(query, 'pixiv busy', expected=True)
        pages[response.meta['page']] = [] if 'error' in d else d['contents']
        if None not in pages:
            # All ranking pages fetched: flatten into one art list.
            arts = list(chain(*pages))
            return Page(
                query=query,
                uri=make_ranking_uri(query),
                total=len(arts),
                arts=arts,
            )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_user_illustrations_uri(self, response):
    """Parse a pixiv member-illustrations page into a ``Page``.

    404 is an expected failure. A positive count badge with no parsed
    arts is reported as inconsistent data.
    """
    query = response.meta['query']
    if response.status == 404:
        return failed(query, '404', expected=True)
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % uri)
    sel = Selector(response)
    try:
        # The count badge carries the user's total illustration count.
        total = int(sel.xpath(
            '//*[@id="wrapper"]//span[@class="count-badge"]/text()'
        ).re(r'\d+')[0])
        arts = list(parse_user_arts(sel))[:self.max_arts]
        if not arts and total > 0:
            return failed(query, 'data inconsist', response=response)
        return Page(
            query=query,
            uri=uri,
            total=total,
            arts=arts,
        )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse(self, response):
    """Wrap a raw HTTP response (status, headers, base64-encoded body,
    optional request payload) into a ``Response`` item.
    """
    query = response.meta['query']
    try:
        data = dict(
            status=response.status,
            headers=dict(response.headers),
            # base64 keeps arbitrary binary bodies serialization-safe.
            body=base64.b64encode(response.body),
        )
        payload = response.meta.get('payload')
        if payload:
            data['payload'] = payload
        return Response(data=data, query=query)
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse(self, response):
    """Parse a gallery listing table ("itg") into a ``Page`` of posts."""
    query = response.meta['query']
    try:
        sel = Selector(response)
        return Page(
            uri=response.url,
            query=query,
            posts=[
                make_post(sub) for sub in sel.xpath(
                    '//table[@class="itg"]/tr[starts-with(@class, "gtr")]')
            ],
        )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse(self, response):
    """Parse a gallery listing table ("itg") into a ``Page`` of posts."""
    query = response.meta['query']
    try:
        sel = Selector(response)
        return Page(
            uri=response.url,
            query=query,
            posts=[
                make_post(sub) for sub in sel.xpath(
                    '//table[@class="itg"]/tr[starts-with(@class, "gtr")]')
            ],
        )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_ranking_uri(self, response):
    """Accumulate one ranking JSON page into ``meta['pages']``; emit the
    complete ``Page`` once every slot has been filled.

    A non-JSON body is reported as pixiv being busy (expected); an
    'error' payload contributes an empty slot rather than failing.
    """
    query = response.meta['query']
    if response.status == 404:
        return failed(query, '404')
    try:
        pages = response.meta['pages']
        try:
            d = json.loads(response.body_as_unicode())
        except ValueError:
            return failed(query, 'pixiv busy', expected=True)
        pages[response.meta['page']] = [] if 'error' in d else d['contents']
        if None not in pages:
            # All ranking pages fetched: flatten into one art list.
            arts = list(chain(*pages))
            return Page(
                query=query,
                uri=make_ranking_uri(query),
                total=len(arts),
                arts=arts,
            )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_posts(self, response):
    """Parse a yande.re posts JSON response into a ``Posts`` item.

    A JSON decode error is treated as the site being busy (expected);
    any other error returns a traceback-carrying ``failed`` item.
    """
    query = response.meta['query']
    uri = response.meta['uri']
    try:
        return Posts(query=query, uri=uri,
                     posts=json.loads(response.body_as_unicode()))
    except ValueError:
        # Non-JSON body: the site is serving its busy page.
        return failed(query, 'yande.re busy', expected=True)
    except Exception:
        # Was ``self.failed(query, str(e))`` — every other handler in
        # this file uses the module-level ``failed`` with the full
        # traceback and the response attached; made consistent.
        return failed(query, traceback.format_exc(), response=response)
def checkbusy(f):
    """Decorator: short-circuit a parse callback with a 'pixiv busy'
    ``failed`` item when the response body's md5 matches a known busy
    page. Handles both plain and generator callbacks.
    """
    def is_busy(response):
        return md5(response.body).hexdigest() in BUSY_BODY_MD5_LIST

    def busy_item(response):
        return failed(response.meta['query'], 'pixiv busy', expected=True)

    if inspect.isgeneratorfunction(f):
        def inner(self, response):
            if is_busy(response):
                # BUG FIX: original called the one-argument ``make``
                # lambda as ``make()`` — TypeError on every busy hit.
                yield busy_item(response)
                return
            for ret in f(self, response):
                yield ret
    else:
        def inner(self, response):
            # BUG FIX: same missing-argument call in this branch.
            return busy_item(response) if is_busy(response) \
                else f(self, response)
    return inner
def parse_list(self, response):
    """Parse a toranoana search-result table into a ``Page`` item.

    Returns a ``failed`` item when the site reports busy or parsing
    raises; returns an empty ``Page`` for an empty result page.
    """
    query = response.meta['query']
    if busy(response.body_as_unicode()):
        return failed(query, 'tora busy', expected=True)
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % query, level=log.INFO)
    if empty(response.body_as_unicode()):
        log.msg('empty result', level=log.INFO)
        return Page(query=query, uri=uri, total=0, arts=[])

    def gen(trs):
        # Rows [2:-1:2]: data rows start at index 2, alternate with
        # spacer rows, and the last row is a footer.
        for tr in trs[2:-1:2][:MAX_ARTS]:
            yield Art(
                title=tr.xpath('td[@class="c1"]/a/text()').extract()[0],
                author=tr.xpath('td[@class="c2"]/a/text()').extract()[0],
                company=tr.xpath('td[@class="c3"]/a/text()').extract()[0],
                uri=urljoin(
                    BASE_URL,
                    tr.xpath('td[@class="c1"]/a/@href').extract()[0]),
                # u'予' in the c7 cell marks a reservation.
                status='reserve' if u'予' in tr.xpath(
                    'td[@class="c7"]/text()').extract() else 'other',
            )

    sel = Selector(response)
    try:
        trs = list(sel.xpath('//table[@class="FixFrame"]//tr'))
        return Page(query=query, uri=uri, total=total(sel),
                    arts=list(gen(trs)))
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse_complex_list(self, response):
    """Parse a "complex" toranoana listing.

    Builds a single shared ``Page`` with a pre-sized ``arts`` slot list,
    then yields one detail-page request per art; ``parse_art`` fills the
    slots via the shared ``ranks`` map. Yields ``failed`` on busy/error
    and an empty ``Page`` on an empty result.
    """
    query = response.meta['query']
    if busy(response.body_as_unicode()):
        yield failed(query, 'tora busy', expected=True)
        return
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % query, level=log.INFO)
    if empty(response.body_as_unicode()):
        log.msg('empty result', level=log.INFO)
        yield Page(query=query, uri=uri, total=0, arts=[])
        return
    sel = Selector(response)
    try:
        uris = [
            urljoin(BASE_URL, href) for href in sel.xpath(
                '//tr[@class="TBLdtil"]/td[@class="noi_c2"]/a/@href'
            ).extract()[:MAX_ARTS]
        ]
        log.msg('got %d arts' % len(uris))
        page = Page(query=query, uri=uri, total=total_complex(sel),
                    arts=[None] * len(uris))
        ranks = {art_uri: i for i, art_uri in enumerate(uris)}
        # Loop variable renamed from ``uri`` (was shadowing the page uri).
        for art_uri in uris:
            req = self.make_art_request(art_uri)
            req.meta['page'] = page
            req.meta['ranks'] = ranks
            yield req
    except Exception:
        # Narrowed from bare ``except``.
        yield failed(query, traceback.format_exc(), response=response)
def parse_user_illustrations_uri(self, response):
    """Parse a pixiv member-illustrations page into a ``Page``.

    404 is an expected failure. A positive count badge with no parsed
    arts is reported as inconsistent data.
    """
    query = response.meta['query']
    if response.status == 404:
        return failed(query, '404', expected=True)
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % uri)
    sel = Selector(response)
    try:
        # The count badge carries the user's total illustration count.
        total = int(sel.xpath(
            '//*[@id="wrapper"]//span[@class="count-badge"]/text()'
        ).re(r'\d+')[0])
        arts = list(parse_user_arts(sel))[:self.max_arts]
        if not arts and total > 0:
            return failed(query, 'data inconsist', response=response)
        return Page(
            query=query,
            uri=uri,
            total=total,
            arts=arts,
        )
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def parse(self, response):
    """Wrap a raw HTTP response (status, headers, base64-encoded body,
    optional request payload) into a ``Response`` item.
    """
    query = response.meta['query']
    try:
        data = dict(
            status=response.status,
            headers=dict(response.headers),
            # base64 keeps arbitrary binary bodies serialization-safe.
            body=base64.b64encode(response.body),
        )
        payload = response.meta.get('payload')
        if payload:
            data['payload'] = payload
        return Response(data=data, query=query)
    except Exception:
        # Narrowed from bare ``except``.
        return failed(query, traceback.format_exc(), response=response)
def checkbusy(f):
    """Decorator: short-circuit a parse callback with a 'pixiv busy'
    ``failed`` item when the response body's md5 matches a known busy
    page. Handles both plain and generator callbacks.
    """
    def is_busy(response):
        return md5(response.body).hexdigest() in BUSY_BODY_MD5_LIST

    def busy_item(response):
        return failed(response.meta['query'], 'pixiv busy', expected=True)

    if inspect.isgeneratorfunction(f):
        def inner(self, response):
            if is_busy(response):
                # BUG FIX: original called the one-argument ``make``
                # lambda as ``make()`` — TypeError on every busy hit.
                yield busy_item(response)
                return
            for ret in f(self, response):
                yield ret
    else:
        def inner(self, response):
            # BUG FIX: same missing-argument call in this branch.
            return busy_item(response) if is_busy(response) \
                else f(self, response)
    return inner