def lxmlize(url, encoding="utf-8", user_agent=requests.utils.default_user_agent()):
    """Fetch *url* and return its lxml HTML tree with links made absolute.

    A single ``<meta http-equiv="refresh">`` redirect, if present, is
    followed by recursing with the redirect target.
    """
    scraper = Scrapelib(follow_robots=False, requests_per_minute=0)
    scraper.user_agent = user_agent
    entry = scraper.urlopen(url)
    # Re-encode unless the body is already unicode in the default encoding.
    needs_encode = encoding != "utf-8" or not isinstance(entry, unicode)
    if needs_encode:
        entry = entry.encode(encoding)
    page = lxml.html.fromstring(entry)
    refresh = page.xpath('//meta[@http-equiv="refresh"]')
    if not refresh:
        page.make_links_absolute(url)
        return page
    # content looks like "<delay>;url=<target>" -- keep everything after "=".
    _, url = refresh[0].attrib["content"].split("=", 1)
    return lxmlize(url, encoding)
def lxmlize(url, encoding='utf-8', user_agent=requests.utils.default_user_agent()):
    """Download *url*, parse it as HTML, and return the lxml root element.

    Honors one ``<meta http-equiv="refresh">`` redirection level and
    rewrites every link in the returned tree to an absolute URL.
    """
    client = Scrapelib(follow_robots=False, requests_per_minute=0)
    client.user_agent = user_agent
    body = client.urlopen(url)
    if encoding != 'utf-8' or not isinstance(body, unicode):
        body = body.encode(encoding)
    root = lxml.html.fromstring(body)
    refresh_tags = root.xpath('//meta[@http-equiv="refresh"]')
    if refresh_tags:
        # "content" has the form "<delay>;url=<target>"; take the target.
        _, target = refresh_tags[0].attrib['content'].split('=', 1)
        return lxmlize(target, encoding)
    root.make_links_absolute(url)
    return root
def newfunc(
    header: typing.List[str],
    retries: int,
    retry_wait: int,
    rpm: int,
    timeout: int,
    user_agent: str,
    verbosity: int,
    verify: bool,
    fastmode: bool,
    **kwargs: str,
) -> None:
    """Build a configured Scraper and logging setup, then call ``func``.

    Args:
        header: "Name: value" strings merged into the scraper's headers.
        retries: retry attempts passed to the Scraper.
        retry_wait: seconds between retries.
        rpm: requests-per-minute throttle.
        timeout: request timeout in seconds.
        user_agent: User-Agent string for the scraper.
        verbosity: -1 means "default" (INFO, or DEBUG for the test command);
            0/1/2+ map to ERROR/INFO/DEBUG.
        verify: whether to verify TLS certificates.
        fastmode: enable the SQLite response cache (read and write).
        kwargs: forwarded unchanged to the wrapped ``func``.
    """
    scraper = Scraper(
        requests_per_minute=rpm,
        retry_attempts=retries,
        retry_wait_seconds=retry_wait,
        verify=verify,
    )
    scraper.timeout = timeout
    scraper.user_agent = user_agent
    # only update headers, don't overwrite defaults.
    # Split on the FIRST ":" only so header values that themselves contain
    # colons (e.g. "Referer: https://example.com") parse correctly instead
    # of raising ValueError on unpacking.
    scraper.headers.update(
        {k.strip(): v.strip() for k, v in [h.split(":", 1) for h in header]}
    )
    if fastmode:
        scraper.cache_storage = SQLiteCache("spatula-cache.db")
        scraper.cache_write_only = False
    if verbosity == -1:
        # default: INFO, except the test command which wants full detail
        level = logging.INFO if func.__name__ != "test" else logging.DEBUG
    elif verbosity == 0:  # pragma: no cover
        level = logging.ERROR
    elif verbosity == 1:  # pragma: no cover
        level = logging.INFO
    else:  # pragma: no cover
        # verbosity >= 2 -- also catches any unexpected value so that
        # `level` can never be unbound at basicConfig below
        level = logging.DEBUG
    if verbosity < 3:
        # replace parent library logging
        logging.getLogger("scrapelib").setLevel(logging.ERROR)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.basicConfig(level=level)
    return func(**kwargs, scraper=scraper)