def test_lenient_host_checking(self):
    crawler = crawling.Crawler(['http://example.com'], strict=False, loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://www.example.com"))
    self.assertTrue(crawler.url_allowed("http://foo.example.com"))
def test_exclude(self):
    crawler = crawling.Crawler(['http://example.com'], exclude=r'.*pattern', loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://example.com"))
    self.assertFalse(crawler.url_allowed("http://example.com/pattern"))
def test_roots(self):
    crawler = crawling.Crawler(['http://a', 'http://b', 'not-a-host'], loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://a/a"))
    self.assertTrue(crawler.url_allowed("http://b/b"))
    self.assertFalse(crawler.url_allowed("http://c/c"))
    self.assertFalse(crawler.url_allowed("http://127.0.0.1"))
def main():
    args = ARGS.parse_args()
    if '://' not in args.url:
        args.url = 'http://' + args.url

    loop = asyncio.get_event_loop()
    host = urlparse(args.url).netloc
    crawler = crawling.Crawler(args.url,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks,
                               host=host)
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        import sys
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()
def crawl(self, urls=None, *args, **kwargs):
    if self.crawler:
        self.crawler.close()
    if urls is None:
        urls = [self.app_url]
    self.crawler = crawling.Crawler(urls, *args, loop=self.loop, **kwargs)
    self.addCleanup(self.crawler.close)
    self.loop.run_until_complete(self.crawler.crawl())
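# Hypothetical test built on the crawl() helper above; it assumes setUp()
# provides self.app_url and self.loop, and that the Crawler records finished
# fetches in the `done` collection seen elsewhere in this section.
def test_crawl_app_root(self):
    self.crawl()  # defaults to [self.app_url]
    self.assertTrue(self.crawler.done)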
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() if not args.roots: r = redis.StrictRedis(host='localhost', port=6379, db=0) data = [r.blpop('queue:urls_to_crawl')] # data.append(r.blpop('queue:urls_to_crawl')) # data.append(r.blpop('queue:urls_to_crawl')) roots, scrape_data = init_data(data) s = None #Scraper(scrape_data) else: roots = {fix_url(root) for root in args.roots} s = None crawler = crawling.Crawler(roots, scraper=s, data_handler=None, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) ########## REPORTING crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, proxy=args.proxy, loop=loop, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: now = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S') with open("{}.log".format(now), "w") as f: reporting.report(crawler, file=f) # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): "Parse arguments, set up event loop, run crawler and print a report." levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] if args["--quiet"]: logging.basicConfig(level=levels[0]) else: logging.basicConfig(level=levels[int(args["--verbose"])]) # Not sure how to set --strict to True by default with docopts. So this is # where we handle strict vs lenient. if args["--lenient"]: args["--strict"] = False else: args["--strict"] = True if args["--iocp"]: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args["--select"]: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() # Set comprehension to avoid redundancy. roots = {fix_url(root) for root in args["<root>"]} # Instantiating the crawler with our arguments. crawler = crawling.Crawler(roots, exclude=args["--exclude"], strict=args["--strict"], max_redirect=int(args["--max-redirect"]), max_tries=int(args["--max-tries"]), max_tasks=int(args["--max-tasks"]), max_pool=int(args["--max-pool"]) ) # "And this is where the magic happens." try: loop.run_until_complete(crawler.crawl()) except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] # ERROR=0 WARN=1 越小越严重 logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) # 以下条件语句内区分了不同的循环方式,IOCP,select等,涉及系统底层socket操作,代码层面略。 if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: # 效率较低 loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() # 默认循环方式 roots = {fix_url(root) for root in args.roots} # args.roots is a list crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() # 清理内存 print('\nInterrupted\n') finally: reporting.report(crawler) # 打印爬取结果,或输出结果到文件 crawler.close() # aiohttp loop close # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() # clean up process loop.close() # 移除signal处理器
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() # if not args.roots: # print('Use --help for command line help') # return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} roots = {'https://sauna.ru/'} crawler = crawling.Crawler( roots, exclude='\/(review|news|addFavorite|panorama|comment)', strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, scrape_nonhtml=False, ) try: loop.run_until_complete(crawler.dbconnect()) loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def crawl(url):
    loop = asyncio.get_event_loop()
    crawler = crawling.Crawler(url)
    loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.

    domainList = []
    domain = crawler.roots.pop()
    domainList.append(domain)

    show = list(crawler.done)
    show.sort(key=lambda _stat: _stat.url)

    addressList = []
    for stat in show:
        addressList.append(stat.url[len(domain):])
    domainList.append(addressList)
    print(domainList)

    crawler.close()
    loop.close()
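# Minimal sketch of invoking the crawl() helper above from a script entry
# point; the root URL is illustrative only, and passing a list matches the
# roots handling seen in the other snippets in this section.
if __name__ == '__main__':
    crawl(['http://example.com'])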
def main():
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()
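# The ARGS parser referenced by the main() functions above is not shown in
# this section. A minimal sketch of a compatible argparse definition, covering
# only the attributes read above; defaults are illustrative assumptions.
import argparse

ARGS = argparse.ArgumentParser(description="Web crawler")
ARGS.add_argument('roots', nargs='*', default=[],
                  help='Root URL (may be repeated)')
ARGS.add_argument('--iocp', action='store_true',
                  help='Use IOCP event loop (Windows only)')
ARGS.add_argument('--select', action='store_true',
                  help='Use Select event loop instead of the default')
ARGS.add_argument('--exclude', metavar='REGEX',
                  help='Exclude matching URLs')
ARGS.add_argument('--strict', action='store_true', default=True,
                  help='Strict host matching (default)')
ARGS.add_argument('--lenient', action='store_false', dest='strict',
                  help='Lenient host matching')
ARGS.add_argument('--max_redirect', type=int, default=10,
                  help='Limit redirection chains')
ARGS.add_argument('--max_tries', type=int, default=4,
                  help='Limit retries on network errors')
ARGS.add_argument('--max_tasks', type=int, default=100,
                  help='Limit concurrent connections')
ARGS.add_argument('-v', '--verbose', action='count', dest='level', default=2,
                  help='Verbose logging (repeat for more verbose)')
ARGS.add_argument('-q', '--quiet', action='store_const', const=0, dest='level',
                  help='Only log errors')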
def test_deep_root(self):
    # Make sure 'a' is a root domain if the root is a link deep in 'a'.
    crawler = crawling.Crawler(['http://a/a#fragment'], loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://a/b"))
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return global config global headers config = configparser.ConfigParser() config.read('client_app.ini') headers = {"User-Agent": config['client']['user-agent']} # @todo: figure out what to do with these. Currently just for creating the auth URL scopes = [ 'publicData', 'characterContactsRead', 'characterFittingsRead', 'characterLocationRead' ] if args.auth: id = bytes("{}:{}".format(config['client']['Key'], config['client']['secret']), encoding="utf-8") headers.update({ "Authorization": b"Basic " + base64.b64encode(id), "Content-Type": "application/x-www-form-urlencoded" }) if config['client'].get('refresh', None) and not args.invalid: print("Using Refresh token to login") # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post( 'https://login.eveonline.com/oauth/token', data="grant_type=refresh_token&refresh_token={}".format( config['client']['refresh']), headers=headers).json() headers.update( {"Authorization": "Bearer {}".format(r['access_token'])}) else: def handleLogin(httpd, parts): # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post( 'https://login.eveonline.com/oauth/token', data="grant_type=authorization_code&code={}".format( parts['code'][0]), headers=headers).json() config["client"]["refresh"] = r['refresh_token'] with open('client_app.ini', 'w') as configfile: config.write(configfile) headers.update( {"Authorization": "Bearer {}".format(r['access_token'])}) httpd.stop() httpd = StoppableHTTPServer(('', 6789), AuthHandler) url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format( "+".join(scopes), config['client']['key']) print("Please go here to authenticate: \n {}".format(url)) httpd.serve(handleLogin) levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, headers=headers, follow_pages=args.follow_pages, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def test_strict_host_checking(self):
    crawler = crawling.Crawler(['http://example.com'], loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://www.example.com"))
    self.assertFalse(crawler.url_allowed("http://foo.example.com"))
import crawling
import calender
import parsing
import getpass
import os
from bs4 import BeautifulSoup as bs
from timeit import default_timer as timer  # assumed source of timer(); the original import is not shown

# check for same event
# upload new description
# upload new event

user_name = input('Please enter your CEIBA username: ')
password = getpass.getpass('Please enter your CEIBA password: ')

flag = True
while flag:
    try:
        c = crawling.Crawler(user_name, password)
        flag = False
    except crawling.UserNamePassWordError:
        print('Wrong username or password')
        user_name = input('Please enter your CEIBA username: ')
        password = getpass.getpass('Please enter your CEIBA password: ')

for course in c.courses:
    c.get_homework(course)
    c.get_syllabus(course)
c.halt_browser()

one = timer()
idnew = calender.make_calender(c.user)
cal_id = idnew[0]
olduser = idnew[1]