def skyscraper_service():
    """Run the Skyscraper service loop.

    Determines when spiders are due, keeps the spider configuration in
    sync with the git repository, and executes due spiders every 15
    seconds until interrupted with Ctrl-C.
    """
    # Expose a Prometheus metrics endpoint only when a port is configured.
    if skyscraper.settings.PROMETHEUS_METRICS_PORT:
        prometheus_client.start_http_server(
            skyscraper.settings.PROMETHEUS_METRICS_PORT)
    else:
        logging.warning('PROMETHEUS_METRICS_PORT not defined, Prometheus' +
                        ' metrics endpoint will not be available.')
    # Gauge: number of spider configs currently present in the git repo.
    prometheus_num_configs = prometheus_client.Gauge(
        'skyscraper_git_spiders',
        'Number of spiders available in the Skyscraper git repository')
    # Optional Tor proxy for the Scrapy runner, taken from the environment.
    proxy = None
    if os.environ.get('SKYSCRAPER_TOR_PROXY'):
        proxy = os.environ.get('SKYSCRAPER_TOR_PROXY')
    # The declarative repository is the single source of truth for spiders.
    repo = skyscraper.git.DeclarativeRepository(
        skyscraper.settings.GIT_REPOSITORY,
        skyscraper.settings.GIT_WORKDIR,
        skyscraper.settings.GIT_SUBFOLDER,
        skyscraper.settings.GIT_BRANCH)
    spiderloader = skyscraper.spiderloader.GitSpiderLoader(repo)
    settings = get_project_settings()
    pipelines = [_load_pipeline(p) for p in settings.get('ITEM_PIPELINES')]
    # NOTE(review): pyppeteer.launch() returns a coroutine; it is passed
    # un-awaited here, so presumably ChromeCrawler drives it itself --
    # confirm against ChromeCrawler's implementation.
    if settings.get('SKYSCRAPER_CHROME_NO_SANDBOX'):
        browser = pyppeteer.launch(args=['--no-sandbox'])
    else:
        browser = pyppeteer.launch()
    crawler = skyscraper.execution.ChromeCrawler(settings, browser)
    # One runner per engine type; SkyscraperRunner dispatches on the
    # engine declared in each spider config.
    spider_runners = {
        'scrapy': skyscraper.execution.ScrapySpiderRunner(proxy),
        'chrome': skyscraper.execution.ChromeSpiderRunner(
            crawler, spiderloader, pipelines),
    }
    runner = skyscraper.execution.SkyscraperRunner(spider_runners)
    try:
        while True:
            skyscraper.instrumentation.instrument_num_files()
            # Pull the latest spider definitions before each scheduling pass.
            repo.update()
            configs = repo.get_all_configs()
            prometheus_num_configs.set(len(configs))
            runner.update_spider_config(configs)
            logging.debug('Running due spiders')
            runner.run_due_spiders()
            time.sleep(15)
    except KeyboardInterrupt:
        print('Shutdown requested by user.')
def setUp(self):
    """Launch a headless browser, run the crawl coroutine for the home
    URL, and tear the browser and event loop down again afterwards.
    """
    super().setUp()
    # asyncio only provides a default loop on the main thread; worker
    # threads must install their own before any loop access.
    if 'MainThread' != threading.current_thread().name:
        asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()
    try:
        # Signal handlers can only be registered on the main thread, so
        # SIGINT/SIGTERM/SIGHUP handling is disabled for thread safety.
        self._browser = loop.run_until_complete(launch(headless=True,
                                                       handleSIGINT=False,
                                                       handleSIGTERM=False,
                                                       handleSIGHUP=False,
                                                       args=['--no-sandbox']
                                                       ))
        task = asyncio.ensure_future(self.browse(self._home_url))
        asyncio.get_event_loop().run_until_complete(task)
    except Exception as e:
        logger.error("asyncio error:%s", str(e), exc_info=True)
        raise e
    finally:
        # NOTE(review): if launch() itself failed, self._browser may be
        # unset here and this cleanup would raise AttributeError -- confirm.
        try:
            logger.info('chromium closing')
            loop.run_until_complete(self._browser.close())
            self._browser.process.communicate()  # drain/close the FIFO pipe
            logger.info('chromium closed')
        finally:
            loop.close()  # close the loop's internal socket resources
            logger.info('loop closed ')
def crawl_by_browser(self):
    """Crawl all start URLs concurrently through one headless browser,
    limiting concurrency with a semaphore of 5, then shut everything down.
    """
    # asyncio only provides a default loop on the main thread; worker
    # threads must install their own before any loop access.
    if 'MainThread' != threading.current_thread().name:
        asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()
    try:
        # At most 5 pages are browsed at the same time.
        self._semaphore = asyncio.Semaphore(5)
        # Signal handlers can only be registered on the main thread, so
        # SIGINT/SIGTERM/SIGHUP handling is disabled for thread safety.
        self._browser = loop.run_until_complete(launch(headless=True,
                                                       handleSIGINT=False,
                                                       handleSIGTERM=False,
                                                       handleSIGHUP=False,
                                                       args=['--no-sandbox']
                                                       ))
        tasks = [asyncio.ensure_future(self.browse(url))
                 for url in self.start_urls]
        rst = loop.run_until_complete(asyncio.gather(*tasks))
    except Exception as e:
        logger.error("asyncio error:%s", str(e), exc_info=True)
        raise e
    finally:
        # NOTE(review): if launch() itself failed, self._browser may be
        # unset here and this cleanup would raise AttributeError -- confirm.
        try:
            logger.info('chromium closing')
            loop.run_until_complete(self._browser.close())
            self._browser.process.communicate()  # drain/close the FIFO pipe
            logger.info('chromium closed')
        finally:
            loop.close()  # close the loop's internal socket resources
            logger.info('loop closed ')
def setUpClass(cls):
    """Start the test HTTP server and one shared headless browser/page.

    Runs once per test class; the browser and page are reused by all tests.
    """
    cls.port = get_free_port()
    time.sleep(0.1)  # brief pause so the freed port is reliably available
    cls.app = get_application()
    cls.server = cls.app.listen(cls.port)
    # BUG FIX: launch() is a coroutine; it must be driven with sync() to
    # obtain a Browser object. Previously the bare coroutine was stored and
    # the newPage() call below failed (coroutines have no newPage attribute).
    cls.browser = sync(launch(args=['--no-sandbox']))
    cls.page = sync(cls.browser.newPage())
async def _async_render(*, url: str, script: str = None, scrolldown,
                        sleep: int, wait: float, reload,
                        content: Optional[str],
                        timeout: Union[float, int]):
    """Render a page in headless Chromium and return its evaluated HTML.

    :param url: page to fetch when ``reload`` is truthy.
    :param script: optional JavaScript to evaluate on the page.
    :param scrolldown: number of PageDown presses (0/None to skip).
    :param sleep: seconds to sleep between/after scroll steps.
    :param wait: seconds to wait before navigating, to prevent timeouts.
    :param reload: truthy -> GET ``url``; falsy -> render ``content`` inline.
    :param content: HTML to render via a data: URL when not reloading.
    :param timeout: navigation timeout in seconds.
    :returns: ``(content, result)`` on success, ``None`` on timeout.
    """
    browser = None
    try:
        # BUG FIX: pyppeteer.launch() is a coroutine and must be awaited;
        # without the await, browser.newPage() is called on a coroutine.
        browser = await pyppeteer.launch(headless=True, args=['--no-sandbox'])
        page = await browser.newPage()
        # Wait before rendering the page, to prevent timeouts.
        await asyncio.sleep(wait)
        if reload:
            await page.goto(url, options={'timeout': int(timeout * 1000)})
        else:
            # BUG FIX: this is a keyword-only module-level function with no
            # `self`; render the `content` parameter (previously the dead
            # reference `self.html` raised NameError on this branch).
            await page.goto(f'data:text/html,{content}',
                            options={'timeout': int(timeout * 1000)})
        result = None
        if script:
            result = await page.evaluate(script)
        if scrolldown:
            for _ in range(scrolldown):
                await page._keyboard.down('PageDown')
                await asyncio.sleep(sleep)
        else:
            await asyncio.sleep(sleep)
        if scrolldown:
            await page._keyboard.up('PageDown')
        # Return the content of the page, JavaScript evaluated.
        content = await page.content()
        return content, result
    except TimeoutError:
        return None
    finally:
        # BUG FIX: the browser was launched per call but never closed,
        # leaking one Chromium process per render.
        if browser is not None:
            await browser.close()
async def login(self):
    """Log in to douban.com through the login popup iframe.

    Launches a visible (non-headless) browser with a persistent user data
    directory, masks the automation fingerprint, then types the stored
    credentials with human-like random delays.
    """
    browser = await self.loop.create_task(
        launch(headless=False,
               userDataDir="./userData",
               ignoreHTTPSErrors=True,
               # Keep Chrome from advertising automation control.
               ignoreDefaultArgs=['--enable-automation'],
               loop=self.loop,
               args=[
                   "--disable-infobars",
                   f"--window-size={D_WIDTH},{D_HEIGHT}",
                   "--no-sandbox",
                   '--incognito',
                   '--ignore-certificate-errors',
                   '--disable-setuid-sandbox'
               ]))
    page = await browser.newPage()
    await page.setViewport({"width": D_WIDTH, "height": D_HEIGHT})
    # Hide navigator.webdriver so basic bot detection passes.
    await page.evaluateOnNewDocument(
        'Object.defineProperty(navigator, "webdriver", {get:() => false})')
    await page.goto(BASE_URL, {"timeout": 1000 * 60})
    await page.waitForSelector("div.login", options={"timeout": 60 * 1000})
    # The login form lives inside a popup iframe; locate it by URL.
    iframes = page.frames
    for iframe in iframes:
        url = iframe.url
        if 'accounts.douban.com/passport/login_popup' in url:
            # Switch to the password tab and type credentials with random
            # per-keystroke delays to appear human.
            await iframe.click('li.account-tab-account')
            await iframe.type("#username", self.user_name,
                              {'delay': random.randint(60, 121)})
            await asyncio.sleep(random.randint(2, 4))
            await iframe.type("#password", self.user_password,
                              {'delay': random.randint(60, 121)})
            await asyncio.sleep(random.randint(2, 4))
            await iframe.click("div.account-form-field-submit")
            # Leave generous time for captcha solving / redirects.
            await asyncio.sleep(100)
async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int):
    """Fetch *url* in headless Chromium and return its evaluated HTML.

    :param url: page to fetch.
    :param script: optional JavaScript to evaluate on the page.
    :param scrolldown: number of PageDown presses (0/None to skip).
    :param sleep: seconds to sleep between/after scroll steps.
    :returns: ``(content, result)`` on success, ``None`` on timeout.
    """
    try:
        # BUG FIX: pyppeteer.launch() is a coroutine and must be awaited;
        # without the await, browser.newPage() is called on a coroutine.
        browser = await pyppeteer.launch(headless=True)
        page = await browser.newPage()
        # Load the given page (GET request, obviously.)
        await page.goto(url)
        result = None
        if script:
            result = await page.evaluate(script)
        if scrolldown:
            for _ in range(scrolldown):
                await page._keyboard.down('PageDown')
                await asyncio.sleep(sleep)
        else:
            await asyncio.sleep(sleep)
        if scrolldown:
            await page._keyboard.up('PageDown')
        # Return the content of the page, JavaScript evaluated.
        content = await page.content()
        return content, result
    except TimeoutError:
        return None
def setUpClass(cls):
    """Start the test HTTP server and record the launch handle for the class."""
    cls.port = get_free_port()
    cls.url = 'http://localhost:{}/'.format(cls.port)
    cls.app = get_application()
    time.sleep(0.1)  # brief pause so the freed port is reliably available
    cls.server = cls.app.listen(cls.port)
    # NOTE(review): launch() returns a coroutine; cls.browser holds that
    # coroutine, not a Browser object. Presumably a setUp()/helper drives
    # it later -- confirm, otherwise this needs to be awaited/sync()'d here.
    cls.browser = launch(args=['--no-sandbox'])
async def _async_render(*, url: str, script: str = None, scrolldown,
                        sleep: int, wait: float, reload,
                        content: Optional[str],
                        timeout: Union[float, int]):
    """Render a page in headless Chromium and return HTML, script result
    and the live Page object.

    :param url: page to fetch when ``reload`` is truthy.
    :param script: optional JavaScript to evaluate on the page.
    :param scrolldown: number of PageDown presses (0/None to skip).
    :param sleep: seconds to sleep between/after scroll steps.
    :param wait: seconds to wait before navigating, to prevent timeouts.
    :param reload: truthy -> GET ``url``; falsy -> render ``content`` inline.
    :param content: HTML to render via a data: URL when not reloading.
    :param timeout: navigation timeout in seconds.
    :returns: ``(content, result, page)`` on success, ``None`` on timeout.
        The browser is intentionally left open because the page is returned.
    """
    try:
        # BUG FIX: pyppeteer.launch() is a coroutine and must be awaited;
        # without the await, browser.newPage() is called on a coroutine.
        browser = await pyppeteer.launch(headless=True, args=['--no-sandbox'])
        page = await browser.newPage()
        # Wait before rendering the page, to prevent timeouts.
        await asyncio.sleep(wait)
        if reload:
            await page.goto(url, options={'timeout': int(timeout * 1000)})
        else:
            # BUG FIX: this is a keyword-only module-level function with no
            # `self`; render the `content` parameter (previously the dead
            # reference `self.html` raised NameError on this branch).
            await page.goto(f'data:text/html,{content}',
                            options={'timeout': int(timeout * 1000)})
        result = None
        if script:
            result = await page.evaluate(script)
        if scrolldown:
            for _ in range(scrolldown):
                await page._keyboard.down('PageDown')
                await asyncio.sleep(sleep)
        else:
            await asyncio.sleep(sleep)
        if scrolldown:
            await page._keyboard.up('PageDown')
        # Return the content of the page, JavaScript evaluated.
        content = await page.content()
        return content, result, page
    except TimeoutError:
        return None
def run(url):
    """Collect province/region overview URLs from the site's footer
    navigation and store them in the global ``province_urls`` list."""
    global loop
    global province_urls
    global home_url
    province_urls = []
    browser = loop.run_until_complete(
        launch(headless=True,
               dumpio=True,
               args=['--no-sandbox', '--disable-setuid-sandbox']))
    soup = loop.run_until_complete(load_page(browser, url, cache=False))
    # Only footer sections with one of these headings are of interest.
    wanted_headings = [
        'Provincies', 'Regionen', 'Steden', 'Provinces', 'Kantone'
    ]
    for nav_block in soup.findAll('div', {'class': 'footer-nav'}):
        heading = nav_block.find('div', {'class': 'title'})
        if not heading:
            continue
        if heading.text.strip() not in wanted_headings:
            continue
        bprint.blue(heading.text.strip())
        for anchor in nav_block.findAll('a', {'class': 'keywordslink'}):
            area_url = home_url + anchor['href']
            print(area_url)
            province_urls.append(area_url)
    loop.run_until_complete(browser.close())
async def _async_render(*, url: str, script: str = None, scrolldown, sleep: int):
    """Fetch *url* in headless Chromium and return its evaluated HTML.

    :param url: page to fetch.
    :param script: optional JavaScript to evaluate on the page.
    :param scrolldown: number of PageDown presses (0/None to skip).
    :param sleep: seconds to sleep between/after scroll steps.
    :returns: ``(content, result)`` on success, ``None`` on timeout.
    """
    try:
        # BUG FIX: pyppeteer.launch() is a coroutine and must be awaited;
        # without the await, browser.newPage() is called on a coroutine.
        browser = await pyppeteer.launch(headless=True)
        page = await browser.newPage()
        # Load the given page (GET request, obviously.)
        await page.goto(url)
        result = None
        if script:
            result = await page.evaluate(script)
        if scrolldown:
            for _ in range(scrolldown):
                await page._keyboard.down('PageDown')
                await asyncio.sleep(sleep)
        else:
            await asyncio.sleep(sleep)
        if scrolldown:
            await page._keyboard.up('PageDown')
        # Return the content of the page, JavaScript evaluated.
        content = await page.content()
        return content, result
    except TimeoutError:
        return None
async def run_spotify(self):
    """Serve a minimal Spotify Web Playback SDK page over aiohttp and open
    it in Chromium so this host registers as a Spotify Connect device.

    Side effects: creates ``self.app``, ``self.runner``, ``self.site`` and
    ``self.browser``; binds localhost:8080.
    """

    async def handle(request):
        # HTML/JS template registering a Web Playback SDK player; the
        # ### placeholders are substituted below with live values.
        spotify_connect_code = """
            <script src="https://sdk.scdn.co/spotify-player.js"></script>
            <script>
                window.onSpotifyWebPlaybackSDKReady = () => {
                    const token = '###SPOTIFY_ACCESS_TOKEN###';
                    const player = new Spotify.Player({
                        name: '###SPOTIFY_WEB_PLAYER_NAME###',
                        getOAuthToken: cb => { cb(token); }
                    });

                    // Error handling
                    player.addListener('initialization_error', ({ message }) => { console.error(message); });
                    player.addListener('authentication_error', ({ message }) => { console.error(message); });
                    player.addListener('account_error', ({ message }) => { console.error(message); });
                    player.addListener('playback_error', ({ message }) => { console.error(message); });

                    // Playback status updates
                    player.addListener('player_state_changed', state => { console.log(state); });

                    // Ready
                    player.addListener('ready', ({ device_id }) => { console.log('Ready with Device ID', device_id); });

                    // Not Ready
                    player.addListener('not_ready', ({ device_id }) => { console.log('Device ID has gone offline', device_id); });

                    // Connect to the player!
                    player.connect();
                };
            </script>
        """
        access_token = self.spotipy_pkce.get_cached_token()
        if access_token is not None:
            spotify_connect_code = spotify_connect_code.replace(
                "###SPOTIFY_ACCESS_TOKEN###", access_token["access_token"])
        spotify_connect_code = spotify_connect_code.replace(
            "###SPOTIFY_WEB_PLAYER_NAME###", self._attr_player_name)
        return web.Response(text=spotify_connect_code)

    # Start the aiohttp server that serves the player page.
    self.app = web.Application()
    self.app.add_routes([web.get("/", handle)])
    self.runner = web.AppRunner(self.app)
    await self.runner.setup()
    self.site = web.TCPSite(self.runner, "localhost", 8080)
    await self.site.start()
    # BUG FIX: launch() is a coroutine and must be awaited; previously
    # self.browser held a coroutine object and newPage() below failed.
    self.browser = await launch({
        "ignoreDefaultArgs": ["--mute-audio"],
        "executablePath": "/usr/bin/chromium",
    })
    page = await self.browser.newPage()
    await page.goto("http://localhost:8080/")
def browser():
    """Launch a pyppeteer browser on a private event loop and return it,
    restoring the caller's current event loop afterwards.

    NOTE(review): the fresh loop is never installed as the current loop nor
    closed, and pyppeteer may still bind internals to the outer loop --
    confirm this is intended (the browser stays tied to the private loop).
    """
    global_loop = asyncio.get_event_loop()
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(launch(**BROWSER_OPTIONS))
    finally:
        # return old state to not affect outer code:
        asyncio.set_event_loop(global_loop)
def __init__(self, **args):
    """Initialise the crawler.

    Grabs the current event loop, starts one headless Chromium instance on
    it, and stores the remaining keyword arguments for later use.

    :param args: arbitrary configuration keywords kept on the instance.
    """
    self.loop = asyncio.get_event_loop()
    launch_coro = pyppeteer.launch(headless=True)
    self.browser = self.loop.run_until_complete(launch_coro)
    self.args = args
async def run_chrome():
    """Run one Chrome-based spider for the enclosing namespace/spider
    (closure variables) and shut the runner down afterwards."""
    settings = get_project_settings()
    settings['USER_NAMESPACE'] = namespace
    pipelines = [
        _load_pipeline(p) for p in settings.get('ITEM_PIPELINES')
    ]
    # NOTE(review): pyppeteer.launch() returns a coroutine; it is passed
    # un-awaited here, so presumably ChromeCrawler drives it itself --
    # confirm against ChromeCrawler's implementation.
    if settings.get('SKYSCRAPER_CHROME_NO_SANDBOX'):
        browser = pyppeteer.launch(args=['--no-sandbox'])
    else:
        browser = pyppeteer.launch()
    crawler = skyscraper.execution.ChromeCrawler(settings, browser)
    runner = skyscraper.execution.ChromeSpiderRunner(
        crawler, spiderloader, pipelines)
    await runner.run(namespace, spider)
    await runner.close()
def _launch_browser(self):
    """Launch headless Chromium (legacy ``yield from`` generator-coroutine
    style) and close every page it opens by default.

    :returns: a Browser with no open tabs, so callers start from a clean slate.
    """
    browser = yield from pyppeteer.launch({
        'headless': True,
        'executablePath': '/usr/bin/chromium-browser',
        'args': ['--no-sandbox']
    })
    # Chromium starts with an about:blank tab; close all initial pages.
    for p in (yield from browser.pages()):
        yield from p.close()
    return browser
async def _async_render(url):
    """Fetch *url* in headless Chromium and return the JavaScript-evaluated
    HTML, or ``None`` on a navigation timeout.

    :param url: address to load.
    """
    try:
        # BUG FIX: pyppeteer.launch() is a coroutine and must be awaited;
        # without the await, browser.newPage() is called on a coroutine.
        browser = await pyppeteer.launch()
        page = await browser.newPage()
        # Load the given page (GET request, obviously.)
        await page.goto(url)
        # Return the content of the page, JavaScript evaluated.
        return await page.content()
    except TimeoutError:
        return None
def __init__(self, config_file_path):
    """Start a headless browser and load the scraping configuration.

    :param config_file_path: path passed through to ``load_config``.
    """
    # Launch the browser; SIGINT handling is disabled so launching also
    # works outside the main thread. self.wait presumably drives the
    # coroutine to completion -- confirm against its implementation.
    self.browser = self.wait(
        launch({
            "headless": True,
            "handleSIGINT": False
        }))
    # Page-state attributes, populated once a page has been fetched.
    self.html_source = None
    self.dom = None
    self.initialized = False
    self.config = self.load_config(config_file_path)
def get_browser(self, pyppeteer_args=None):
    """Return the cached pyppeteer Browser, creating it on first use.

    :param pyppeteer_args: optional dict overriding the default launch
        options; if it contains ``browserWSEndpoint`` the method connects
        to an already-running browser instead of launching a new one.
    """
    overrides = {} if pyppeteer_args is None else pyppeteer_args
    if not hasattr(self, "_browser"):
        self.loop = asyncio.get_event_loop()
        options = {'headless': True, 'args': ['--no-sandbox']}
        options.update(overrides)
        if 'browserWSEndpoint' in options:
            coro = pyppeteer.connect(**options)
        else:
            coro = pyppeteer.launch(**options)
        self._browser = self.loop.run_until_complete(coro)
    return self._browser
def spider_opened(self, spider):
    """Prepare the driver matching the spider's crawl type.

    'selenium' gets a Chrome webdriver configured from CONST; 'puppeeter'
    gets a pyppeteer browser with noisy library logging silenced.

    :param spider: the spider being opened; ``spider.crawl_type.value``
        selects the driver.
    """
    PrintFormatUtil.print_line("spider {} : 开始处理".format(spider.name))
    PrintFormatUtil.print_line(
        "spider {} , 运行模式 {}".format(spider.name, spider.crawl_type.value))
    if spider.crawl_type.value == 'selenium':
        chrome_options = Options()
        # Apply every configured Chrome flag (plain loop instead of a
        # throwaway list comprehension used only for its side effects).
        for option in CONST.CHROME_DRIVER_OPTIONS:
            chrome_options.add_argument(option)
        self.driver = webdriver.Chrome(
            chrome_options=chrome_options,
            executable_path=CONST.CHROME_DRIVER_BIN_PATH)
    if spider.crawl_type.value == 'puppeeter':
        # Silence pyppeteer/websockets debug chatter (the original set the
        # pyppeteer level twice; once is enough).
        pyppeteer_level = logging.WARNING
        logging.getLogger('pyppeteer').setLevel(pyppeteer_level)
        logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
        # BUG FIX: pyppeteer launch options are lower-case ('headless');
        # the mis-cased 'Headless' key was silently ignored.
        self.driver = sync(launch({'headless': True,
                                   'args': ['--no-sandbox', '--disable-gpu'],
                                   'dumpio': True}))
def run_sub_areas(urls):
    """Scrape all sub-area pages in batches of 100, starting a fresh
    browser for every batch and closing it when the batch is done."""
    global loop
    batch_size = 100
    batches = round(len(urls) / batch_size)
    bprint.green(f'{batches + 1} processes')
    for batch_no in tqdm(range(batches + 1)):
        browser = loop.run_until_complete(
            launch(headless=True,
                   dumpio=True,
                   args=['--no-sandbox', '--disable-setuid-sandbox']))
        start = batch_no * batch_size
        jobs = [parse_sub_area(browser, u)
                for u in urls[start:start + batch_size]]
        loop.run_until_complete(asyncio.gather(*jobs))
        loop.run_until_complete(browser.close())
async def _async_render(url: str, script: str = None):
    """Render *url* in headless Chromium, optionally evaluating *script*.

    :param url: page to fetch.
    :param script: optional JavaScript to evaluate on the page.
    :returns: ``(content, result)`` where ``result`` is the script's value
        (``None`` when no script was given), or ``None`` on timeout.
    """
    try:
        # BUG FIX: pyppeteer.launch() is a coroutine and must be awaited;
        # without the await, browser.newPage() is called on a coroutine.
        browser = await pyppeteer.launch(headless=True)
        page = await browser.newPage()
        # Load the given page (GET request, obviously.)
        await page.goto(url)
        # BUG FIX: initialise result so the return below cannot raise
        # UnboundLocalError when no script was supplied.
        result = None
        if script:
            result = await page.evaluate(script)
        # Return the content of the page, JavaScript evaluated.
        content = await page.content()
        return content, result
    except TimeoutError:
        return None
def run(urls):
    """Crawl restaurant pages in batches of 100, with a fresh browser per
    batch and a short settling pause after each launch."""
    global prefix
    batch_size = 100
    batches = round(len(urls) / batch_size)
    loop = asyncio.get_event_loop()
    for batch_no in tqdm(range(batches + 1)):
        browser = loop.run_until_complete(
            launch(headless=True,
                   dumpio=True,
                   args=['--no-sandbox', '--disable-setuid-sandbox']))
        loop.run_until_complete(asyncio.sleep(3))
        start = batch_no * batch_size
        jobs = [parse_restaurant(browser, u)
                for u in urls[start:start + batch_size]]
        loop.run_until_complete(asyncio.gather(*jobs))
        loop.run_until_complete(browser.close())
    loop.close()
def _launch_browser(self): browser = yield from pyppeteer.launch({ 'headless': False, 'dumpio': True, 'args': [ '--disable-infobars', '--devtools=false', '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-accelerated-2d-canvas', '--no-first-run', '--no-zygote', '--disable-gpu', '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36' ] }) # for p in (yield from browser.pages()): # yield from p.close() return browser
def run_provinces(urls):
    """Visit each province page and collect every sub-area link it contains
    into the global ``sub_area_urls`` list."""
    global loop
    global sub_area_urls
    global home_url
    sub_area_urls = []
    browser = loop.run_until_complete(
        launch(headless=True,
               dumpio=True,
               args=['--no-sandbox', '--disable-setuid-sandbox']))
    for province_url in urls:
        soup = loop.run_until_complete(load_page(browser, province_url))
        for area_block in soup.findAll('div', {'class': 'delarea'}):
            for anchor in area_block.find_all('a', href=True):
                sub_area_urls.append(home_url + anchor['href'])
    loop.run_until_complete(browser.close())
def start(self) -> None:
    """
    Starts the crawler. This method will block until the crawler is finished.

    Lifecycle: launch browser -> adopt the initial tab -> on_start ->
    _run -> on_stop -> close page and browser -> reset state flags.
    """
    self._running = True
    self._browser = syncer.sync(pyppeteer.launch())
    # Chromium opens with a single about:blank tab; adopt it as page 0.
    self._page = syncer.sync(self._browser.pages())[0]  # about:blank page
    self._page_index = 0
    self._add_page_listeners(self._page)
    self.on_start()
    self._run()
    self.on_stop()
    syncer.sync(self._page.close())
    syncer.sync(self._browser.close())
    self._running = False
    self._stop_initiated = False
def run(cls) -> None:
    """Entry point: crawl all news sections concurrently with one browser."""
    loop = asyncio.get_event_loop()
    # Launch the proxied (non-headless) browser and fetch the timestamp in
    # parallel; gather preserves order, so index 0 is the browser.
    futures = [
        launch(
            headless=False,
            # devtools=True,
            args=['--proxy-server=' + PROXY_SERVER]),
        cls.get_timestamp()
    ]
    browser = loop.run_until_complete(asyncio.gather(*futures))[0]
    # Crawl every news category concurrently, sharing the one browser.
    futures = [
        cls().get_other_news(browser),
        cls().get_finance_news(browser),
        cls().get_tech_news(browser),
        cls().get_sports_news(browser)
    ]
    loop.run_until_complete(asyncio.gather(*futures))
    # Persist the collected items and shut the browser down together.
    loop.run_until_complete(
        asyncio.gather(cls.build_json(), browser.close()))
def test_simple():
    """Smoke-test deepl_tr end to end on one short sentence.

    Drives everything through a single explicitly-managed event loop:
    pytest-asyncio fixtures (@pytest.fixture(scope="function") async def /
    @pytest.mark.asyncio) always failed here with "loop already closed"
    runtime errors, hence the manual loop handling.
    """
    try:
        loop = asyncio.get_event_loop()
    except Exception:
        # No usable current loop: create and install a fresh one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    # Timings observed: asyncio.run(pyppeteer.launch()) ~3s;
    # run_until_complete(pyppeteer.launch()) ~1.8s; get_ppbrowser() ~500ms.
    browser = loop.run_until_complete(pyppeteer.launch())
    page = loop.run_until_complete(browser.newPage())

    text = "test this and more"
    # First call is slow (~10s) due to page warm-up; later calls ~3.25s.
    res = loop.run_until_complete(deepl_tr(text, page=page))
    assert "试" in res

    loop.run_until_complete(page.close())
    loop.run_until_complete(browser.close())
def run(urls):
    """Crawl restaurant pages in batches of 100, one browser per batch,
    pausing briefly after each launch to let Chromium settle."""
    global prefix
    batch_size = 100
    batches = round(len(urls) / batch_size)
    loop = asyncio.get_event_loop()
    for batch_no in tqdm(range(batches + 1)):
        browser = loop.run_until_complete(launch(headless=True, dumpio=True))
        loop.run_until_complete(asyncio.sleep(3))
        start = batch_no * batch_size
        jobs = [parse_restaurant(browser, u)
                for u in urls[start:start + batch_size]]
        loop.run_until_complete(asyncio.gather(*jobs))
        loop.run_until_complete(browser.close())
    loop.close()
def __init__(self, alias: str, downloader: Downloader = RenderDownloader(),
             redup: MemoryRedup = MemoryRedup(),
             scheduler: MemoryScheduler = MemoryScheduler(),
             parser: Parser = HtmlParser(),
             pipeline: ConsolePipeline = ConsolePipeline(),
             logger: ConsoleLogger = ConsoleLogger()):
    """Instantiate the crawler; every parameter matters, fill them in carefully.

    NOTE(review): each default above is a shared mutable instance (the
    mutable-default-argument pitfall) -- all crawlers created without
    explicit arguments share the same objects. Confirm this is intended.

    :param alias: site name, for easier identification
    :param downloader: page downloader, e.g. RenderDownloader
    :param redup: deduplication object; options: MemoryRedup, RedisRedup
    :param scheduler: scheduler object; options: MemoryScheduler
    :param parser: item parser
    :param pipeline: persistence object; options: ConsoleDao, FilePipeline,
        MySQLPipeline, WordPressPipeline
    :param logger: logger object; options: NoLogger, ConsoleLogger
    """
    self.pid = os.getpid()
    self.logger = logger
    self.logger.tag(alias)
    self.alias = alias
    self.downloader = downloader
    self.redup = redup
    self.scheduler = scheduler
    self.parser = parser
    self.pipeline = pipeline
    # One shared headless Chromium for the crawler; slowMo throttles each
    # browser operation by 1 ms.
    self.browser = asyncio.get_event_loop().run_until_complete(launch({'headless': True, 'args': ['--no-sandbox', '--disable-setuid-sandbox'], 'dumpio': True, 'slowMo': 1}))
    self.browser_page = None
    self.event_loop = asyncio.get_event_loop()
    self.template = None  # holds the template matching the current page
def browser(self):
    """Lazily launch and cache one headless, sandbox-less Chromium; the
    event loop used for the launch is kept on ``self.loop``."""
    if not hasattr(self, "_browser"):
        self.loop = asyncio.get_event_loop()
        launch_coro = pyppeteer.launch(headless=True, args=['--no-sandbox'])
        self._browser = self.loop.run_until_complete(launch_coro)
    return self._browser
async def main():
    """Open the article page, save a full screenshot to example.png, and
    close the browser."""
    # BUG FIX: launch() is a coroutine and must be awaited; previously
    # `browser` was a coroutine object and newPage() failed.
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://kenh14.vn/ai-roi-cung-khac-cac-hot-girl-nay-cung-khong-ngoai-le-khi-vong-1-cu-ngay-cang-phong-phao-20171207193958533.chn')
    await page.screenshot({'path': 'example.png'})
    await browser.close()
def setUpClass(cls):
    """Spin up the test application server and one shared browser for the
    whole class, recording the base URL for the tests."""
    cls.port = get_free_port()
    cls.url = f'http://localhost:{cls.port}/'
    cls.app = get_application()
    cls.server = cls.app.listen(cls.port)
    cls.browser = sync(launch(DEFAULT_OPTIONS))
def setUp(self):
    """Launch a sandbox-less browser and remove any stale test.pdf left
    over from a previous run."""
    self.browser = sync(launch(args=['--no-sandbox']))
    pdf_path = Path(__file__).resolve().parent / 'test.pdf'
    self.target_path = pdf_path
    if pdf_path.exists():
        pdf_path.unlink()