def main():
    args = ARGS.parse_args()
    if '://' not in args.url:
        args.url = 'http://' + args.url

    loop = asyncio.get_event_loop()
    host = urlparse(args.url).netloc
    crawler = crawling.Crawler(args.url,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks,
                               host=host)
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        import sys
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()

def main(): """ Main program. Parse arguments, run crawler, print report """ args = ARGS.parse_args() if not args.b and not args.f: print "Use --help for command line help" return if args.b: bugs = {fix_url(bug) for bug in args.b} else: bugs = [fix_url(bug) for bug in read(args.f[0])] try: output = [] start_time = time.time() for bug in bugs: result = crawler.download(bug) output.append(result) total_time = round(time.time() - start_time, 2) print "It took %s seconds to download %s bug reports!" % (total_time, len(bugs)) report(output) except KeyboardInterrupt: print "Interupted!" except crawler.BugNotFound, e: print "An error occurred while crawling bug: " + bug print e.message
def create_spider(raw_url):
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    loop = asyncio.get_event_loop()
    crawler = crawling.Crawler(raw_url,
                               exclude=None,
                               strict=True,
                               max_redirect=10,
                               max_tries=4,
                               max_tasks=10,
                               )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
        return False
    finally:
        reporting.report(crawler)
        crawler.close()
    return True

def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() if not args.roots: r = redis.StrictRedis(host="localhost", port=6379, db=0) data = [r.blpop("queue:urls_to_crawl")] # data.append(r.blpop('queue:urls_to_crawl')) # data.append(r.blpop('queue:urls_to_crawl')) roots, scrape_data = init_data(data) s = None # Scraper(scrape_data) else: roots = {fix_url(root) for root in args.roots} s = None crawler = crawling.Crawler( roots, scraper=s, data_handler=None, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print("\nInterrupted\n") finally: reporting.report(crawler) ########## REPORTING crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() if not args.roots: r = redis.StrictRedis(host='localhost', port=6379, db=0) data = [r.blpop('queue:urls_to_crawl')] # data.append(r.blpop('queue:urls_to_crawl')) # data.append(r.blpop('queue:urls_to_crawl')) roots, scrape_data = init_data(data) s = None #Scraper(scrape_data) else: roots = {fix_url(root) for root in args.roots} s = None crawler = crawling.Crawler(roots, scraper=s, data_handler=None, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) ########## REPORTING crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, proxy=args.proxy, loop=loop, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: now = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S') with open("{}.log".format(now), "w") as f: reporting.report(crawler, file=f) # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): "Parse arguments, set up event loop, run crawler and print a report." levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] if args["--quiet"]: logging.basicConfig(level=levels[0]) else: logging.basicConfig(level=levels[int(args["--verbose"])]) # Not sure how to set --strict to True by default with docopts. So this is # where we handle strict vs lenient. if args["--lenient"]: args["--strict"] = False else: args["--strict"] = True if args["--iocp"]: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args["--select"]: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() # Set comprehension to avoid redundancy. roots = {fix_url(root) for root in args["<root>"]} # Instantiating the crawler with our arguments. crawler = crawling.Crawler(roots, exclude=args["--exclude"], strict=args["--strict"], max_redirect=int(args["--max-redirect"]), max_tries=int(args["--max-tries"]), max_tasks=int(args["--max-tasks"]), max_pool=int(args["--max-pool"]) ) # "And this is where the magic happens." try: loop.run_until_complete(crawler.crawl()) except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] # ERROR=0 WARN=1 越小越严重 logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) # 以下条件语句内区分了不同的循环方式,IOCP,select等,涉及系统底层socket操作,代码层面略。 if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: # 效率较低 loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() # 默认循环方式 roots = {fix_url(root) for root in args.roots} # args.roots is a list crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() # 清理内存 print('\nInterrupted\n') finally: reporting.report(crawler) # 打印爬取结果,或输出结果到文件 crawler.close() # aiohttp loop close # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() # clean up process loop.close() # 移除signal处理器
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() # if not args.roots: # print('Use --help for command line help') # return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} roots = {'https://sauna.ru/'} crawler = crawling.Crawler( roots, exclude='\/(review|news|addFavorite|panorama|comment)', strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, scrape_nonhtml=False, ) try: loop.run_until_complete(crawler.dbconnect()) loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main():
    loop = asyncio.get_event_loop()
    roots = ("http://doc.1.com/platform/realname/",)
    crawler = Crawler(roots)
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print("\nInterrupted\n")
    finally:
        # Write the crawl report to a file, then release crawler and loop resources.
        with open("report.txt", "w+") as f:
            report(crawler, file=f)
        crawler.close()
        loop.stop()
        loop.run_forever()
        loop.close()

def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] # logging.basicConfig(filename='crawl.log',level=levels[min(args.level, len(levels)-1)]) logging.basicConfig(filename='crawl.log',level=levels[min(args.level, len(levels)-1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler(roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def generate_reports():
    print '\nGenerating reports for %d team members.' % len(s['members'])
    members = s['members']
    for member in members:
        print "\nReport for %s:" % member

        # get case resolves
        projects, project_cases = reporting.get_resolved_cases(s['config'], member, today, lastweek)
        resolves = len(project_cases)
        print 'Resolves: %d' % resolves

        activity_projects, activity_cases = reporting.get_case_activity(s['config'], member, today, lastweek)
        activity = len(activity_cases)
        print 'Activity: %d' % activity

        member_repo_list, changeset_list = reporting.get_commits(s['config'], member, today, lastweek)
        commits = 0
        for item in changeset_list:
            for key, value in item.iteritems():
                commits += len(value)
        print 'Commits: %d' % commits

        member.add_overview_data({'date': datetime.now(),
                                  'resolves': resolves,
                                  'activity': activity,
                                  'commits': commits})

        save_dir = os.path.join(s['config']['home'], "reports", member.username)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        graph_file = graphing.graph(member, save_dir)
        reporting.report(member, save_dir, graph_file, projects, project_cases,
                         activity_projects, activity_cases, member_repo_list,
                         changeset_list)

    s['members'] = members

def main():
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()

def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return global config global headers config = configparser.ConfigParser() config.read('client_app.ini') headers = {"User-Agent": config['client']['user-agent']} # @todo: figure out what to do with these. Currently just for creating the auth URL scopes = [ 'publicData', 'characterContactsRead', 'characterFittingsRead', 'characterLocationRead' ] if args.auth: id = bytes("{}:{}".format(config['client']['Key'], config['client']['secret']), encoding="utf-8") headers.update({ "Authorization": b"Basic " + base64.b64encode(id), "Content-Type": "application/x-www-form-urlencoded" }) if config['client'].get('refresh', None) and not args.invalid: print("Using Refresh token to login") # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post( 'https://login.eveonline.com/oauth/token', data="grant_type=refresh_token&refresh_token={}".format( config['client']['refresh']), headers=headers).json() headers.update( {"Authorization": "Bearer {}".format(r['access_token'])}) else: def handleLogin(httpd, parts): # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post( 'https://login.eveonline.com/oauth/token', data="grant_type=authorization_code&code={}".format( parts['code'][0]), headers=headers).json() config["client"]["refresh"] = r['refresh_token'] with open('client_app.ini', 'w') as configfile: config.write(configfile) headers.update( {"Authorization": "Bearer {}".format(r['access_token'])}) httpd.stop() httpd = StoppableHTTPServer(('', 6789), AuthHandler) url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format( "+".join(scopes), config['client']['key']) print("Please go here to authenticate: \n {}".format(url)) httpd.serve(handleLogin) levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, headers=headers, follow_pages=args.follow_pages, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return global config global headers config = configparser.ConfigParser() config.read('client_app.ini') headers = { "User-Agent": config['client']['user-agent'] } # @todo: figure out what to do with these. Currently just for creating the auth URL scopes = [ 'publicData', 'characterContactsRead', 'characterFittingsRead', 'characterLocationRead' ] if args.auth: id = bytes("{}:{}".format(config['client']['Key'], config['client']['secret']), encoding="utf-8") headers.update({ "Authorization": b"Basic " + base64.b64encode(id), "Content-Type": "application/x-www-form-urlencoded" }) if config['client'].get('refresh', None) and not args.invalid: print("Using Refresh token to login") # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post('https://login.eveonline.com/oauth/token', data="grant_type=refresh_token&refresh_token={}".format(config['client']['refresh']), headers=headers).json() headers.update({"Authorization": "Bearer {}".format(r['access_token'])}) else: def handleLogin(httpd, parts): # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post('https://login.eveonline.com/oauth/token', data="grant_type=authorization_code&code={}".format(parts['code'][0]), headers=headers).json() config["client"]["refresh"] = r['refresh_token'] with open('client_app.ini', 'w') as configfile: config.write(configfile) headers.update({"Authorization": "Bearer {}".format(r['access_token'])}) httpd.stop() httpd = StoppableHTTPServer(('', 6789), AuthHandler) url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format("+".join(scopes), config['client']['key']) print("Please go here to authenticate: \n {}".format(url)) httpd.serve(handleLogin) levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels)-1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler(roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, headers=headers, follow_pages=args.follow_pages, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def report(self, path, version="unknown"):
    report = reporting.report(self.windSpeedBins, self.turbulenceBins, version)
    report.report(path, self)

#!/usr/bin/env python2.7
from __future__ import absolute_import, division, print_function, unicode_literals

import sys
from os import path

ourdir = path.realpath(path.dirname(__file__))
sys.path.insert(0, ourdir)

import reporting

argv = sys.argv[1:]
start, end = reporting.parse(argv)
cmd = ['ledger'] + argv + reporting.list_datfiles(start, end)
reporting.report(cmd)

    )
else:
    logging.basicConfig(level=leveldict[loglevel],
                        format=logfmt,
                        datefmt=datefmt)

storage = FileManager(datadir, False, False)
gazette_ia = GazetteIA(storage, access_key, secret_key, loglevel, logfile)

stats = Stats()
if relurls:
    for relurl in relurls:
        handle_relurl(gazette_ia, relurl, to_upload, to_update, stats)
elif from_stdin:
    for line in sys.stdin:
        relurl = line.strip()
        handle_relurl(gazette_ia, relurl, to_upload, to_update, stats)
else:
    for relurl in storage.find_matching_relurls(srcnames, start_ts, end_ts):
        handle_relurl(gazette_ia, relurl, to_upload, to_update, stats)

if to_addrs:
    msg = stats.get_message(srcnames)
    reporting.report(gmail_user, gmail_pwd, to_addrs,
                     'Stats for gazette on %s' % datetime.date.today(), msg)