def _crawl(self, url, **kwargs):
    task = {}

    if kwargs.get('callback'):
        callback = kwargs['callback']
        if isinstance(callback, basestring) and hasattr(self, callback):
            func = getattr(self, callback)
        elif hasattr(callback, 'im_self') and callback.im_self is self:
            func = callback
            kwargs['callback'] = func.__name__
        else:
            raise NotImplementedError("self.%s() not implemented!" % callback)
        if hasattr(func, '_config'):
            for k, v in func._config.iteritems():
                kwargs.setdefault(k, v)

    if hasattr(self, 'crawl_config'):
        for k, v in self.crawl_config.iteritems():
            kwargs.setdefault(k, v)

    url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))
    if kwargs.get('files'):
        assert isinstance(kwargs.get('data', {}), dict), "data must be a dict when using with files!"
        content_type, data = _encode_multipart_formdata(kwargs.get('data', {}),
                                                        kwargs.get('files', {}))
        kwargs.setdefault('headers', {})
        kwargs['headers']['Content-Type'] = content_type
        kwargs['data'] = data
    if kwargs.get('data'):
        kwargs['data'] = _encode_params(kwargs['data'])
    if kwargs.get('data'):
        kwargs.setdefault('method', 'POST')

    schedule = {}
    for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'):
        if key in kwargs and kwargs[key] is not None:
            schedule[key] = kwargs[key]
    if schedule:
        task['schedule'] = schedule

    fetch = {}
    for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies',
                'proxy', 'etag', 'last_modifed', 'save', 'js_run_at', 'js_script',
                'load_images', 'fetch_type'):
        if key in kwargs and kwargs[key] is not None:
            fetch[key] = kwargs[key]
    if fetch:
        task['fetch'] = fetch

    process = {}
    for key in ('callback', ):
        if key in kwargs and kwargs[key] is not None:
            process[key] = kwargs[key]
    if process:
        task['process'] = process

    task['project'] = self.project_name
    task['url'] = url
    task['taskid'] = task.get('taskid') or md5string(url)

    self._follows.append(task)
    return task
def _crawl(self, url, **kwargs): """ real crawl API checking kwargs, and repack them to each sub-dict """ task = {} assert len(url) < 1024, "Maximum (1024) URL length error." if kwargs.get('callback'): callback = kwargs['callback'] if isinstance(callback, six.string_types) and hasattr(self, callback): func = getattr(self, callback) elif six.callable(callback) and six.get_method_self(callback) is self: func = callback kwargs['callback'] = func.__name__ else: raise NotImplementedError("self.%s() not implemented!" % callback) if hasattr(func, '_config'): for k, v in iteritems(func._config): if isinstance(v, dict) and isinstance(kwargs.get(k), dict): kwargs[k].update(v) else: kwargs.setdefault(k, v) url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None))) if kwargs.get('files'): assert isinstance( kwargs.get('data', {}), dict), "data must be a dict when using with files!" content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}), kwargs.pop('files', {})) kwargs.setdefault('headers', {}) kwargs['headers']['Content-Type'] = content_type kwargs['data'] = data if kwargs.get('data'): kwargs['data'] = _encode_params(kwargs['data']) if kwargs.get('data'): kwargs.setdefault('method', 'POST') schedule = {} for key in self.schedule_fields: if key in kwargs: schedule[key] = kwargs.pop(key) elif key in self.crawl_config: schedule[key] = self.crawl_config[key] task['schedule'] = schedule fetch = {} for key in self.fetch_fields: if key in kwargs: fetch[key] = kwargs.pop(key) task['fetch'] = fetch process = {} for key in self.process_fields: if key in kwargs: process[key] = kwargs.pop(key) task['process'] = process task['project'] = self.project_name task['url'] = url if 'taskid' in kwargs: task['taskid'] = kwargs.pop('taskid') else: task['taskid'] = self.get_taskid(task) if kwargs: raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys()) if self.is_debugger(): task = self.task_join_crawl_config(task, self.crawl_config) if task['fetch'].get('proxy', False) and task['fetch'].get('fetch_type', None) in ('js', 'phantomjs') \ and not hasattr(self, '_proxy_warning'): self.logger.warning('phantomjs does not support specify proxy from script, use phantomjs args instead') self._proxy_warning = True cache_key = "%(project)s:%(taskid)s" % task if cache_key not in self._follows_keys: self._follows_keys.add(cache_key) self._follows.append(task) return task
def _crawl(self, url, **kwargs): """ real crawl API checking kwargs, and repack them to each sub-dict """ task = {} if kwargs.get('callback'): callback = kwargs['callback'] if isinstance(callback, six.string_types) and hasattr(self, callback): func = getattr(self, callback) elif six.callable(callback) and six.get_method_self(callback) is self: func = callback kwargs['callback'] = func.__name__ else: raise NotImplementedError("self.%s() not implemented!" % callback) if hasattr(func, '_config'): for k, v in iteritems(func._config): kwargs.setdefault(k, v) for k, v in iteritems(self.crawl_config): kwargs.setdefault(k, v) url = quote_chinese(_build_url(url.strip(), kwargs.get('params'))) if kwargs.get('files'): assert isinstance( kwargs.get('data', {}), dict), "data must be a dict when using with files!" content_type, data = _encode_multipart_formdata(kwargs.get('data', {}), kwargs.get('files', {})) kwargs.setdefault('headers', {}) kwargs['headers']['Content-Type'] = content_type kwargs['data'] = data if kwargs.get('data'): kwargs['data'] = _encode_params(kwargs['data']) if kwargs.get('data'): kwargs.setdefault('method', 'POST') schedule = {} for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'): if key in kwargs and kwargs[key] is not None: schedule[key] = kwargs[key] task['schedule'] = schedule fetch = {} for key in ( 'method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies', 'proxy', 'etag', 'last_modifed', 'save', 'js_run_at', 'js_script', 'load_images', 'fetch_type' ): if key in kwargs and kwargs[key] is not None: fetch[key] = kwargs[key] task['fetch'] = fetch process = {} for key in ('callback', ): if key in kwargs and kwargs[key] is not None: process[key] = kwargs[key] task['process'] = process task['project'] = self.project_name task['url'] = url task['taskid'] = task.get('taskid') or self.get_taskid(task) cache_key = "%(project)s:%(taskid)s" % task if cache_key not in self._follows_keys: self._follows_keys.add(cache_key) self._follows.append(task) return task
def _crawl(self, url, **kwargs): """ real crawl API checking kwargs, and repack them to each sub-dict """ task = {} assert len(url) < 1024, "Maximum (1024) URL length error." if kwargs.get('callback'): callback = kwargs['callback'] if isinstance(callback, six.string_types) and hasattr( self, callback): func = getattr(self, callback) elif six.callable( callback) and six.get_method_self(callback) is self: func = callback kwargs['callback'] = func.__name__ else: raise NotImplementedError("self.%s() not implemented!" % callback) if hasattr(func, '_config'): for k, v in iteritems(func._config): if isinstance(v, dict) and isinstance(kwargs.get(k), dict): kwargs[k].update(v) else: kwargs.setdefault(k, v) for k, v in iteritems(self.crawl_config): if isinstance(v, dict) and isinstance(kwargs.get(k), dict): kwargs[k].update(v) else: kwargs.setdefault(k, v) url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None))) if kwargs.get('files'): assert isinstance( kwargs.get('data', {}), dict), "data must be a dict when using with files!" content_type, data = _encode_multipart_formdata( kwargs.pop('data', {}), kwargs.pop('files', {})) kwargs.setdefault('headers', {}) kwargs['headers']['Content-Type'] = content_type kwargs['data'] = data if kwargs.get('data'): kwargs['data'] = _encode_params(kwargs['data']) if kwargs.get('data'): kwargs.setdefault('method', 'POST') schedule = {} for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update', 'auto_recrawl'): if key in kwargs: schedule[key] = kwargs.pop(key) task['schedule'] = schedule fetch = {} for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies', 'proxy', 'etag', 'last_modifed', 'last_modified', 'save', 'js_run_at', 'js_script', 'js_viewport_width', 'js_viewport_height', 'load_images', 'fetch_type', 'use_gzip', 'validate_cert', 'max_redirects', 'robots_txt'): if key in kwargs: fetch[key] = kwargs.pop(key) task['fetch'] = fetch process = {} for key in ('callback', ): if key in kwargs: process[key] = kwargs.pop(key) task['process'] = process task['project'] = self.project_name task['url'] = url if 'taskid' in kwargs: task['taskid'] = kwargs.pop('taskid') else: task['taskid'] = self.get_taskid(task) if kwargs: raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys()) cache_key = "%(project)s:%(taskid)s" % task if cache_key not in self._follows_keys: self._follows_keys.add(cache_key) self._follows.append(task) return task
def _crawl(self, url, **kwargs): """ real crawl API checking kwargs, and repack them to each sub-dict """ task = {} assert len(url) < 1024, "Maximum (1024) URL length error." if kwargs.get('callback'): callback = kwargs['callback'] if isinstance(callback, six.string_types) and hasattr(self, callback): func = getattr(self, callback) elif six.callable(callback) and six.get_method_self(callback) is self: func = callback kwargs['callback'] = func.__name__ else: raise NotImplementedError("self.%s() not implemented!" % callback) if hasattr(func, '_config'): for k, v in iteritems(func._config): if isinstance(v, dict) and isinstance(kwargs.get(k), dict): kwargs[k].update(v) else: kwargs.setdefault(k, v) for k, v in iteritems(self.crawl_config): if isinstance(v, dict) and isinstance(kwargs.get(k), dict): kwargs[k].update(v) else: kwargs.setdefault(k, v) url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None))) if kwargs.get('files'): assert isinstance( kwargs.get('data', {}), dict), "data must be a dict when using with files!" content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}), kwargs.pop('files', {})) kwargs.setdefault('headers', {}) kwargs['headers']['Content-Type'] = content_type kwargs['data'] = data if kwargs.get('data'): kwargs['data'] = _encode_params(kwargs['data']) if kwargs.get('data'): kwargs.setdefault('method', 'POST') schedule = {} for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update', 'auto_recrawl'): if key in kwargs: schedule[key] = kwargs.pop(key) task['schedule'] = schedule fetch = {} for key in ( 'method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies', 'proxy', 'etag', 'last_modifed', 'save', 'js_run_at', 'js_script', 'js_viewport_width', 'js_viewport_height', 'load_images', 'fetch_type', 'use_gzip', 'validate_cert', 'max_redirects' ): if key in kwargs: fetch[key] = kwargs.pop(key) task['fetch'] = fetch process = {} for key in ('callback', ): if key in kwargs: process[key] = kwargs.pop(key) task['process'] = process task['project'] = self.project_name task['url'] = url if 'taskid' in kwargs: task['taskid'] = kwargs.pop('taskid') else: task['taskid'] = self.get_taskid(task) if kwargs: raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys()) cache_key = "%(project)s:%(taskid)s" % task if cache_key not in self._follows_keys: self._follows_keys.add(cache_key) self._follows.append(task) return task
def http_fetch(self, url, task):
    '''HTTP fetcher'''
    start_time = time.time()
    self.on_fetch('http', task)
    handle_error = lambda x: self.handle_error('http', url, task, start_time, x)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})

    session = cookies.RequestsCookieJar()
    # fix for tornado request obj
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    max_redirects = task_fetch.get('max_redirects', 5)
    # we will handle redirects by hand to capture cookies
    fetch['follow_redirects'] = False

    # making requests
    while True:
        # robots.txt
        if task_fetch.get('robots_txt', False):
            can_fetch = yield self.can_fetch(fetch['headers']['User-Agent'], fetch['url'])
            if not can_fetch:
                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        try:
            request = tornado.httpclient.HTTPRequest(**fetch)
            # if cookie already in header, get_cookie_header wouldn't work
            old_cookie_header = request.headers.get('Cookie')
            if old_cookie_header:
                del request.headers['Cookie']
            cookie_header = cookies.get_cookie_header(session, request)
            if cookie_header:
                request.headers['Cookie'] = cookie_header
            elif old_cookie_header:
                request.headers['Cookie'] = old_cookie_header
        except Exception as e:
            logger.exception(fetch)
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        extract_cookies_to_jar(session, response.request, response.headers)
        if (response.code in (301, 302, 303, 307)
                and response.headers.get('Location')
                and task_fetch.get('allow_redirects', True)):
            if max_redirects <= 0:
                error = tornado.httpclient.HTTPError(
                    599, 'Maximum (%d) redirects followed' % task_fetch.get('max_redirects', 5),
                    response)
                raise gen.Return(handle_error(error))
            if response.code in (302, 303):
                fetch['method'] = 'GET'
                if 'body' in fetch:
                    del fetch['body']
            fetch['url'] = quote_chinese(urljoin(fetch['url'], response.headers['Location']))
            fetch['request_timeout'] -= time.time() - start_time
            if fetch['request_timeout'] < 0:
                fetch['request_timeout'] = 0.1
            max_redirects -= 1
            continue

        result = {}
        result['orig_url'] = url
        result['content'] = response.body or ''
        result['headers'] = dict(response.headers)
        result['status_code'] = response.code
        result['url'] = response.effective_url or url
        result['time'] = time.time() - start_time
        result['cookies'] = session.get_dict()
        result['save'] = task_fetch.get('save')
        if response.error:
            result['error'] = utils.text(response.error)
        if 200 <= response.code < 300:
            logger.info("[%d] %s:%s %s %.2fs", response.code,
                        task.get('project'), task.get('taskid'),
                        url, result['time'])
        else:
            logger.warning("[%d] %s:%s %s %.2fs", response.code,
                           task.get('project'), task.get('taskid'),
                           url, result['time'])

        raise gen.Return(result)
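# Illustrative only (not part of the original source): shape of the result dict that
# http_fetch above returns via gen.Return, with placeholder values.
example_http_fetch_result = {
    'orig_url': 'http://example.com/',        # the url argument as passed in
    'url': 'http://example.com/landing',      # response.effective_url after redirects
    'status_code': 200,                       # response.code
    'headers': {'Content-Type': 'text/html'},
    'content': '<html>...</html>',            # response.body (empty string if no body)
    'cookies': {'sessionid': 'abc123'},       # session.get_dict()
    'time': 0.42,                             # seconds elapsed since start_time
    'save': None,                             # task_fetch.get('save') passed through
    # 'error': '...',                         # present only when response.error is set
}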
def _crawl(self, url, **kwargs): """ real crawl API checking kwargs, and repack them to each sub-dict """ task = {} if kwargs.get('callback'): callback = kwargs['callback'] if isinstance(callback, six.string_types) and hasattr( self, callback): func = getattr(self, callback) elif six.callable( callback) and six.get_method_self(callback) is self: func = callback kwargs['callback'] = func.__name__ else: raise NotImplementedError("self.%s() not implemented!" % callback) if hasattr(func, '_config'): for k, v in iteritems(func._config): kwargs.setdefault(k, v) for k, v in iteritems(self.crawl_config): kwargs.setdefault(k, v) url = quote_chinese(_build_url(url.strip(), kwargs.get('params'))) if kwargs.get('files'): assert isinstance( kwargs.get('data', {}), dict), "data must be a dict when using with files!" content_type, data = _encode_multipart_formdata( kwargs.get('data', {}), kwargs.get('files', {})) kwargs.setdefault('headers', {}) kwargs['headers']['Content-Type'] = content_type kwargs['data'] = data if kwargs.get('data'): kwargs['data'] = _encode_params(kwargs['data']) if kwargs.get('data'): kwargs.setdefault('method', 'POST') schedule = {} for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update'): if key in kwargs and kwargs[key] is not None: schedule[key] = kwargs[key] task['schedule'] = schedule fetch = {} for key in ('method', 'headers', 'data', 'timeout', 'allow_redirects', 'cookies', 'proxy', 'etag', 'last_modifed', 'save', 'js_run_at', 'js_script', 'load_images', 'fetch_type'): if key in kwargs and kwargs[key] is not None: fetch[key] = kwargs[key] task['fetch'] = fetch process = {} for key in ('callback', ): if key in kwargs and kwargs[key] is not None: process[key] = kwargs[key] task['process'] = process task['project'] = self.project_name task['url'] = url task['taskid'] = task.get('taskid') or self.get_taskid(task) cache_key = "%(project)s:%(taskid)s" % task if cache_key not in self._follows_keys: self._follows_keys.add(cache_key) self._follows.append(task) return task
def _crawl(self, url, **kwargs): """ real crawl API checking kwargs, and repack them to each sub-dict """ task = {} assert len(url) < 1024, "Maximum (1024) URL length error." if kwargs.get("callback"): callback = kwargs["callback"] if isinstance(callback, six.string_types) and hasattr(self, callback): func = getattr(self, callback) elif six.callable(callback) and six.get_method_self(callback) is self: func = callback kwargs["callback"] = func.__name__ else: raise NotImplementedError("self.%s() not implemented!" % callback) if hasattr(func, "_config"): for k, v in iteritems(func._config): kwargs.setdefault(k, v) for k, v in iteritems(self.crawl_config): kwargs.setdefault(k, v) url = quote_chinese(_build_url(url.strip(), kwargs.pop("params", None))) if kwargs.get("files"): assert isinstance(kwargs.get("data", {}), dict), "data must be a dict when using with files!" content_type, data = _encode_multipart_formdata(kwargs.pop("data", {}), kwargs.pop("files", {})) kwargs.setdefault("headers", {}) kwargs["headers"]["Content-Type"] = content_type kwargs["data"] = data if kwargs.get("data"): kwargs["data"] = _encode_params(kwargs["data"]) if kwargs.get("data"): kwargs.setdefault("method", "POST") schedule = {} for key in ("priority", "retries", "exetime", "age", "itag", "force_update", "auto_recrawl"): if key in kwargs: schedule[key] = kwargs.pop(key) task["schedule"] = schedule fetch = {} for key in ( "method", "headers", "data", "timeout", "allow_redirects", "cookies", "proxy", "etag", "last_modifed", "save", "js_run_at", "js_script", "js_viewport_width", "js_viewport_height", "load_images", "fetch_type", "use_gzip", "validate_cert", ): if key in kwargs: fetch[key] = kwargs.pop(key) task["fetch"] = fetch process = {} for key in ("callback",): if key in kwargs: process[key] = kwargs.pop(key) task["process"] = process task["project"] = self.project_name task["url"] = url if "taskid" in kwargs: task["taskid"] = kwargs.pop("taskid") else: task["taskid"] = self.get_taskid(task) if kwargs: raise TypeError("crawl() got unexpected keyword argument: %s" % kwargs.keys()) cache_key = "%(project)s:%(taskid)s" % task if cache_key not in self._follows_keys: self._follows_keys.add(cache_key) self._follows.append(task) return task