def save_link(self, url, dir='images', file_name='auto', format='jpg', prefix='', **_options):
    fn = ''
    if file_name == 'auto':
        #special name: derive the file name from the url
        fn = common.DataItem(url).rr(r'\?.*?$').subreg(r'/([^/\?\$]+\.[a-z]{2,4})$--is')
        if not fn:
            self.logger.warn('failed to parse file_name from url: %s', url)
            return None
    else:
        #file_name is a fixed name
        fn = file_name

    if not common.subreg(fn, r'(\.[a-z]{2,5}$)--is'):
        fn += '.' + format

    fn = prefix + fn

    if not os.path.exists(os.path.join(self.dir, dir)):
        os.makedirs(os.path.join(self.dir, dir))

    path = os.path.join(self.dir, dir, fn)

    if os.path.exists(path):
        return fn #already downloaded
    else:
        #start downloading the file
        options = common.combine_dicts(self.config, _options)
        res = self.client.fetch_data(http.Request(url=url, bin=True, **options))
        if res.status.code == 200 and res.data:
            common.put_bin(path, res.data)
            return fn
        else:
            return None
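# Usage sketch (illustrative, not part of this module): assuming `s` is a configured
# scraper instance and `doc` is a parsed detail page, save_link derives the file name
# from the url unless an explicit file_name is passed. The xpath below is hypothetical.
#
#   img_url = doc.x("//img[@id='main-photo']/@src")
#   saved_name = s.save_link(img_url, dir='images', prefix='product_')
#   if saved_name is None:
#       s.logger.warn('image could not be saved: %s', img_url)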
def download_file(self, url, filename, dir='images', **_options):
    dir_path = self.join_path(dir)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    path = os.path.join(self.dir, dir, filename)

    if os.path.exists(path):
        return filename #already downloaded
    else:
        #start downloading the file
        options = common.combine_dicts(self.config, _options)
        res = self.client.fetch_data(http.Request(url=url, bin=True, **options))
        if res.code == 200 and res.data:
            common.put_bin(path, res.data)
            return filename
        else:
            return None
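# Usage sketch (illustrative): unlike save_link, download_file expects the caller to
# supply the target file name explicitly. The names `s` and `pdf_url` below are
# assumptions made up for this example.
#
#   pdf_url = 'http://example.com/reports/annual.pdf'
#   if s.download_file(pdf_url, filename='annual_report.pdf', dir='reports'):
#       print 'file saved (or already present) under the scraper dir'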
def pagin(self, url, next=None, post=None, next_post=None, parse_list=None,
          detail=None, parse_detail=None, cc=3, max_pages=0, list_pages_first=True,
          start_now=False, debug=True, verify=None, meta={}, **_options):

    if cc != self.downloader.cc:
        self.downloader.set_cc(cc)

    options = common.combine_dicts(self.config, _options) #apply scraper-level options

    stats = common.DataObject(page=1)

    def handler(doc):
        page = stats.page
        doc.page = page

        if verify:
            if not verify(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)):
                doc.ok = False
                logger.warn("invalid doc at page {0}".format(page))

        logger.info('page %s', page)

        #download and parse details
        if detail:
            listings = detail(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)) if hasattr(detail, '__call__') else doc.q(detail)
            logger.info('details: %s', len(listings))
            for listing in listings:
                self.downloader.put(
                    Request(url=listing if isinstance(listing, basestring) else listing.nodevalue(),
                            cb=parse_detail, meta=meta, **options),
                    onhold=list_pages_first)

        done = False
        _nexturl = None
        _next_post = None

        if next:
            _nexturl = next(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)) if hasattr(next, '__call__') else (next if next.startswith('http') else doc.x(next))

        if next_post:
            if not next:
                #next is not provided, use the original url
                _nexturl = doc.url

            _next_post = next_post(common.DataObject(doc=doc, page=page, starturl=common.DataItem(url))) if hasattr(next_post, '__call__') else next_post

        if next_post:
            done = not _next_post
        else:
            done = not _nexturl

        #if (next and _nexturl) or (next_post and _next_post):
        if not done:
            #logger.debug('next_post: %s, _nexturl: %s', _next_post, _nexturl)
            stats.page += 1
            if max_pages != 0 and stats.page > max_pages:
                done = True
            else:
                self.downloader.put(Request(_nexturl, _next_post, cb=handler, **options))
        else:
            done = True

        if parse_list:
            parse_list(doc)

    ##### end of the handler function ##################################################

    #start the initial url
    self.downloader.put(Request(url, post, cb=handler, **options))

    if start_now:
        self.downloader.start()
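# Usage sketch (illustrative): a hypothetical category crawl with pagin. The xpath
# selectors, the `parse_item` callback and the scraper instance `s` are assumptions
# made up for this example; per the code above, `next` can be an xpath to the
# "next page" link and `detail` an xpath selecting listing links.
#
#   def parse_item(doc):
#       print doc.x("//h1[@class='title']")
#
#   s.pagin(
#       'http://example.com/category?page=1',
#       next="//a[@rel='next']/@href",
#       detail="//div[@class='listing']//a/@href",
#       parse_detail=parse_item,
#       cc=3,
#       max_pages=10,
#       start_now=True)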
def load_json(self, url, post=None, **_options):
    options = common.combine_dicts(self.config, _options)
    return self.client.load_json(Request(url=url, post=post, **options))
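# Usage sketch (illustrative): load_json merely merges the scraper-level config into
# the request options and delegates to the http client. The endpoint and `s` below
# are made up for the example.
#
#   data = s.load_json('http://example.com/api/items?page=1')
#   if data:
#       print 'items:', len(data.get('items', []))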
def normalize(self, scraper):
    """ normalize this req using the provided scraper's config """
    if self.is_normalized:
        return self

    self.scraper = scraper

    #copy scraper-wide options if not set yet
    self.options = common.combine_dicts(scraper.config, self.options)

    req = self

    self.url = common.normalize_url(self.url)
    # self.url = str(self.url)

    accept_error_codes = req.get('accept_error_codes')
    if accept_error_codes is None:
        accept_error_codes = []
        req.set('accept_error_codes', accept_error_codes)

    #default headers
    user_agent = req.get('user_agent', agent.firefox) #default agent is firefox
    if user_agent == 'random':
        user_agent = agent.random_agent()

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "User-Agent": user_agent,
        "Accept-Language": "en-us,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        # "Connection": "close" #turn off keep-alive
        "Connection": "keep-alive"
    }

    if req.post:
        headers.update({"Content-Type": "application/x-www-form-urlencoded"})

    #update user-passed in headers
    if req.get('headers'):
        if req.get('merge_headers') is not False:
            #merge user defined headers with default headers
            headers.update(req.get('headers'))
        else:
            #only use user defined headers
            headers = req.get('headers')

    req.set('headers', headers)

    proxy = req.get('proxy') or scraper.proxy_manager.get_proxy(req.url)

    if proxy and req.get('proxy_url_filter'):
        #check if this url is qualified for using a proxy
        if not re.compile(req.get('proxy_url_filter')).findall(req.url):
            #filter failed, skip the proxy for this url
            proxy = ''
            logger.debug('proxy not used for url: %s', req.url)

    req.set('proxy', proxy)

    #normalise the post
    if req.post and isinstance(req.post, common.MyDict):
        req.post = req.post.dict()
    if req.post and isinstance(req.post, dict):
        req.post = urllib.urlencode(sorted(req.post.items()))

    self.is_normalized = True

    return self
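# Sketch of what normalize produces (illustrative): given a scraper whose config
# requests a random user agent, the request below ends up with merged headers and
# url-encoded post data after normalization. `scraper` and the example values are
# assumptions; the option names (user_agent, headers, merge_headers, proxy_url_filter,
# accept_error_codes) are the ones read by the code above.
#
#   req = Request('http://example.com/search', post={'q': 'books', 'page': 1},
#                 user_agent='random', headers={'Referer': 'http://example.com/'})
#   req.normalize(scraper)  # req.post -> 'page=1&q=books', headers merged with defaults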