def test_lenient_host_checking(self):
    crawler = crawling.Crawler(['http://example.com'], strict=False, loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://www.example.com"))
    self.assertTrue(crawler.url_allowed("http://foo.example.com"))
def test_exclude(self):
    crawler = crawling.Crawler(['http://example.com'], exclude=r'.*pattern', loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://example.com"))
    self.assertFalse(crawler.url_allowed("http://example.com/pattern"))
def test_roots(self):
    crawler = crawling.Crawler(['http://a', 'http://b', 'not-a-host'], loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://a/a"))
    self.assertTrue(crawler.url_allowed("http://b/b"))
    self.assertFalse(crawler.url_allowed("http://c/c"))
    self.assertFalse(crawler.url_allowed("http://127.0.0.1"))
def main():
    args = ARGS.parse_args()
    if '://' not in args.url:
        args.url = 'http://' + args.url

    loop = asyncio.get_event_loop()
    host = urlparse(args.url).netloc
    crawler = crawling.Crawler(args.url,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks,
                               host=host)
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        import sys
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()
def crawl(self, urls=None, *args, **kwargs):
    if self.crawler:
        self.crawler.close()
    if urls is None:
        urls = [self.app_url]
    self.crawler = crawling.Crawler(urls, *args, loop=self.loop, **kwargs)
    self.addCleanup(self.crawler.close)
    self.loop.run_until_complete(self.crawler.crawl())
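# Hypothetical test built on the crawl() helper above; it assumes setUp()
# provides self.app_url and self.loop, and that the Crawler records finished
# fetches in the `done` collection seen elsewhere in this section.
def test_crawl_app_root(self):
    self.crawl()  # defaults to [self.app_url]
    self.assertTrue(self.crawler.done)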
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() if not args.roots: r = redis.StrictRedis(host='localhost', port=6379, db=0) data = [r.blpop('queue:urls_to_crawl')] # data.append(r.blpop('queue:urls_to_crawl')) # data.append(r.blpop('queue:urls_to_crawl')) roots, scrape_data = init_data(data) s = None #Scraper(scrape_data) else: roots = {fix_url(root) for root in args.roots} s = None crawler = crawling.Crawler(roots, scraper=s, data_handler=None, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) ########## REPORTING crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, proxy=args.proxy, loop=loop, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: now = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S') with open("{}.log".format(now), "w") as f: reporting.report(crawler, file=f) # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): "Parse arguments, set up event loop, run crawler and print a report." levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] if args["--quiet"]: logging.basicConfig(level=levels[0]) else: logging.basicConfig(level=levels[int(args["--verbose"])]) # Not sure how to set --strict to True by default with docopts. So this is # where we handle strict vs lenient. if args["--lenient"]: args["--strict"] = False else: args["--strict"] = True if args["--iocp"]: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args["--select"]: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() # Set comprehension to avoid redundancy. roots = {fix_url(root) for root in args["<root>"]} # Instantiating the crawler with our arguments. crawler = crawling.Crawler(roots, exclude=args["--exclude"], strict=args["--strict"], max_redirect=int(args["--max-redirect"]), max_tries=int(args["--max-tries"]), max_tasks=int(args["--max-tasks"]), max_pool=int(args["--max-pool"]) ) # "And this is where the magic happens." try: loop.run_until_complete(crawler.crawl()) except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] # ERROR=0 WARN=1 越小越严重 logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) # 以下条件语句内区分了不同的循环方式,IOCP,select等,涉及系统底层socket操作,代码层面略。 if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: # 效率较低 loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() # 默认循环方式 roots = {fix_url(root) for root in args.roots} # args.roots is a list crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() # 清理内存 print('\nInterrupted\n') finally: reporting.report(crawler) # 打印爬取结果,或输出结果到文件 crawler.close() # aiohttp loop close # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() # clean up process loop.close() # 移除signal处理器
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() # if not args.roots: # print('Use --help for command line help') # return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} roots = {'https://sauna.ru/'} crawler = crawling.Crawler( roots, exclude='\/(review|news|addFavorite|panorama|comment)', strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, scrape_nonhtml=False, ) try: loop.run_until_complete(crawler.dbconnect()) loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def crawl(url):
    loop = asyncio.get_event_loop()
    crawler = crawling.Crawler(url)
    loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.

    domainList = []
    domain = crawler.roots.pop()
    domainList.append(domain)

    show = list(crawler.done)
    show.sort(key=lambda _stat: _stat.url)

    addressList = []
    for stat in show:
        addressList.append(stat.url[len(domain):])
    domainList.append(addressList)
    print(domainList)

    crawler.close()
    loop.close()
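# Minimal sketch of invoking the crawl() helper above from a script entry
# point; the root URL is illustrative only, and passing a list matches the
# roots handling seen in the other snippets in this section.
if __name__ == '__main__':
    crawl(['http://example.com'])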
def main():
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()
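# The ARGS parser referenced by the main() functions above is not shown in
# this section. A minimal sketch of a compatible argparse definition, covering
# only the attributes read above; defaults are illustrative assumptions.
import argparse

ARGS = argparse.ArgumentParser(description="Web crawler")
ARGS.add_argument('roots', nargs='*', default=[],
                  help='Root URL (may be repeated)')
ARGS.add_argument('--iocp', action='store_true',
                  help='Use IOCP event loop (Windows only)')
ARGS.add_argument('--select', action='store_true',
                  help='Use Select event loop instead of the default')
ARGS.add_argument('--exclude', metavar='REGEX',
                  help='Exclude matching URLs')
ARGS.add_argument('--strict', action='store_true', default=True,
                  help='Strict host matching (default)')
ARGS.add_argument('--lenient', action='store_false', dest='strict',
                  help='Lenient host matching')
ARGS.add_argument('--max_redirect', type=int, default=10,
                  help='Limit redirection chains')
ARGS.add_argument('--max_tries', type=int, default=4,
                  help='Limit retries on network errors')
ARGS.add_argument('--max_tasks', type=int, default=100,
                  help='Limit concurrent connections')
ARGS.add_argument('-v', '--verbose', action='count', dest='level', default=2,
                  help='Verbose logging (repeat for more verbose)')
ARGS.add_argument('-q', '--quiet', action='store_const', const=0, dest='level',
                  help='Only log errors')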
def test_deep_root(self):
    # Make sure 'a' is a root domain if the root is a link deep in 'a'.
    crawler = crawling.Crawler(['http://a/a#fragment'], loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://a/b"))
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return global config global headers config = configparser.ConfigParser() config.read('client_app.ini') headers = {"User-Agent": config['client']['user-agent']} # @todo: figure out what to do with these. Currently just for creating the auth URL scopes = [ 'publicData', 'characterContactsRead', 'characterFittingsRead', 'characterLocationRead' ] if args.auth: id = bytes("{}:{}".format(config['client']['Key'], config['client']['secret']), encoding="utf-8") headers.update({ "Authorization": b"Basic " + base64.b64encode(id), "Content-Type": "application/x-www-form-urlencoded" }) if config['client'].get('refresh', None) and not args.invalid: print("Using Refresh token to login") # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post( 'https://login.eveonline.com/oauth/token', data="grant_type=refresh_token&refresh_token={}".format( config['client']['refresh']), headers=headers).json() headers.update( {"Authorization": "Bearer {}".format(r['access_token'])}) else: def handleLogin(httpd, parts): # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post( 'https://login.eveonline.com/oauth/token', data="grant_type=authorization_code&code={}".format( parts['code'][0]), headers=headers).json() config["client"]["refresh"] = r['refresh_token'] with open('client_app.ini', 'w') as configfile: config.write(configfile) headers.update( {"Authorization": "Bearer {}".format(r['access_token'])}) httpd.stop() httpd = StoppableHTTPServer(('', 6789), AuthHandler) url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format( "+".join(scopes), config['client']['key']) print("Please go here to authenticate: \n {}".format(url)) httpd.serve(handleLogin) levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, headers=headers, follow_pages=args.follow_pages, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def test_strict_host_checking(self):
    crawler = crawling.Crawler(['http://example.com'], loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://www.example.com"))
    self.assertFalse(crawler.url_allowed("http://foo.example.com"))
import crawling
import calender
import parsing
import getpass
import os
from bs4 import BeautifulSoup as bs
from timeit import default_timer as timer  # assumed source of timer(); the original import is not shown

# check for same event
# upload new description
# upload new event

user_name = input('Please enter your CEIBA username: ')
password = getpass.getpass('Please enter your CEIBA password: ')

flag = True
while flag:
    try:
        c = crawling.Crawler(user_name, password)
        flag = False
    except crawling.UserNamePassWordError:
        print('Wrong username or password')
        user_name = input('Please enter your CEIBA username: ')
        password = getpass.getpass('Please enter your CEIBA password: ')

for course in c.courses:
    c.get_homework(course)
    c.get_syllabus(course)
c.halt_browser()

one = timer()
idnew = calender.make_calender(c.user)
cal_id = idnew[0]
olduser = idnew[1]