def _build_response_data(self, req, response):
    """Decode the raw response bytes and wrap them according to the request's return_type."""
    encoding = 'utf8'
    unicode_html = u''
    try:
        unicode_html = response['data'].decode(encoding, 'ignore')
    except Exception:
        logger.warn('failed to decode bytes from url: %s', req.url)

    return_type = req.get('return_type') or 'doc'

    if return_type == 'doc':
        # wrap the html in a parsed document object
        doc = http.Doc(url=req.url, html=unicode_html)
        doc.req = req
        doc.status.code = response['code']
        doc.status.message = response['message']
        return doc

    elif return_type == 'html':
        # return the raw html wrapped in a DataItem
        html = common.DataItem(unicode_html)
        html.req = req
        html.status = common.DataObject()
        html.status.code = response['code']
        html.status.message = response['message']
        return html

    else:
        self.scraper.logger.warn('unsupported return_type: %s', return_type)
        return None
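# A minimal usage sketch for _build_response_data, assuming it is invoked by the
# downloader object that holds the scraper reference, and that `response` has the
# dict shape used above ('data' bytes plus 'code' and 'message'); the variable
# names are illustrative only, not part of the library API:
#
#   response = {'data': b'<html><h1>hi</h1></html>', 'code': 200, 'message': 'OK'}
#   doc = downloader._build_response_data(req, response)  # req built elsewhere, return_type defaults to 'doc'
#   if doc is not None and doc.status.code == 200:
#       title = doc.q('//h1')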
def save(self, record, filename='result.csv', max=None, keys=[], id=None,
         headers=[], remove_existing_file=True, always_quoted=True):
    """Append a record to an output file, de-duplicating by id/keys when provided."""
    # wait while another thread is writing (simple busy-wait lock)
    while self.writingflag:
        pass
    # hold the flag
    self.writingflag = True

    path = os.path.join(self.dir, filename)
    format = common.DataItem(path).subreg('\.([a-z]{2,5})$--is').lower()

    if not self.outdb.get(path):
        if os.path.exists(path) and remove_existing_file:
            os.remove(path)
        self.outdb.update({
            path: common.DataObject(cnt=0, data=[], ids=[], format=format)
        })

    trackingobj = self.outdb.get(path)

    if keys or id:
        # build a unique id from the given key fields when no explicit id is passed
        id = id or u"".join(
            [unicode(record[record.index(key) + 1]) for key in keys])
        if id in trackingobj.ids:
            # duplicate record, skip it
            self.writingflag = False
            return
        else:
            trackingobj.ids.append(id)

    trackingobj.cnt += 1

    if format == 'csv':
        # csv records are written to file immediately
        common.save_csv(path, record, always_quoted=always_quoted)
    elif format in ['xls', 'xlsx']:
        # excel records are buffered and written out on flush()
        trackingobj.data.append(record)

    if max and trackingobj.cnt == max:
        # save output files and quit
        self.flush()
        os._exit(1)

    # free the flag
    self.writingflag = False
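# A usage sketch for save(), assuming records are flat key/value lists as implied
# by record.index(key) + 1 above; the filename and field names are placeholders:
#
#   s.save(['name', 'Acme Widget', 'price', '9.99'],
#          filename='products.csv',
#          keys=['name'])   # skip any later record whose 'name' value was already seen
#
# csv rows are written to disk immediately, while xls/xlsx records are buffered
# until flush() is called.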
def pagin(self, url, next=None, post=None, next_post=None, parse_list=None,
          detail=None, parse_detail=None, cc=3, max_pages=0,
          list_pages_first=True, start_now=False, debug=True, verify=None,
          meta={}, **_options):
    """Crawl a paginated listing, optionally queueing detail pages for parsing."""
    if cc != self.downloader.cc:
        self.downloader.set_cc(cc)

    # apply scraper-level options
    options = common.combine_dicts(self.config, _options)

    stats = common.DataObject(page=1)

    def handler(doc):
        page = stats.page
        doc.page = page

        if verify:
            if not verify(common.DataObject(
                    starturl=common.DataItem(url), page=page, doc=doc)):
                doc.ok = False
                logger.warn("invalid doc at page {0}".format(page))

        logger.info('page %s', page)

        # download and parse detail pages
        if detail:
            listings = detail(common.DataObject(
                starturl=common.DataItem(url), page=page,
                doc=doc)) if hasattr(detail, '__call__') else doc.q(detail)

            logger.info('details: %s', len(listings))

            for listing in listings:
                self.downloader.put(
                    Request(url=listing if isinstance(listing, basestring)
                            else listing.nodevalue(),
                            cb=parse_detail,
                            meta=meta,
                            **options),
                    onhold=list_pages_first)

        done = False
        _nexturl = None
        _next_post = None

        if next:
            _nexturl = next(common.DataObject(
                starturl=common.DataItem(url), page=page,
                doc=doc)) if hasattr(next, '__call__') else (
                    next if next.startswith('http') else doc.x(next))

        if next_post:
            if not next:
                # next is not provided, keep paging against the original url
                _nexturl = doc.url

            _next_post = next_post(common.DataObject(
                doc=doc, page=page,
                starturl=common.DataItem(url))) if hasattr(
                    next_post, '__call__') else next_post

        # there is another page only if a next url (or next post data) was resolved
        if next_post:
            done = not _next_post
        else:
            done = not _nexturl

        if not done:
            stats.page += 1
            if max_pages != 0 and stats.page > max_pages:
                done = True
            else:
                self.downloader.put(
                    Request(_nexturl, _next_post, cb=handler, **options))

        if parse_list:
            parse_list(doc)

    ##### end of the handler function #####

    # start with the initial url
    self.downloader.put(Request(url, post, cb=handler, **options))

    if start_now:
        self.downloader.start()
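# A usage sketch for pagin(), assuming `s` is a configured Scraper instance and
# that the url and xpath expressions below stand in for a real site:
#
#   def parse_item(doc):
#       s.save(['url', doc.url, 'title', doc.x('//h1')], filename='items.csv')
#
#   s.pagin('http://example.com/list?page=1',
#           next="//a[@rel='next']/@href",          # xpath resolving to the next-page url
#           detail="//div[@class='item']/a/@href",  # xpath selecting detail-page links
#           parse_detail=parse_item,
#           max_pages=5,
#           start_now=True)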