def test_crawl_with_csv(self):
    """End-to-end test: crawl local fixture pages, verify the parsed
    products, then export each crawlable's products to CSV and check
    that the resulting file is non-empty before deleting it."""
    # Patch urljoin so relative links resolve against local file fixtures.
    urlparse.urljoin = self.urljoin
    crawler = Crawler()
    epoca_cosmeticos = EpocaCosmeticos()
    # Stub the network-backed page getters with local fixture loaders.
    epoca_cosmeticos.get_home_page = self.get_home_page
    epoca_cosmeticos.get_product_pages = self.get_product_pages
    crawled_dict = crawler.crawl([epoca_cosmeticos])

    base_path = os.path.abspath('.') + os.sep + 'tests'
    products = crawled_dict['EpocaCosmeticos']

    # assertEqual: the assertEquals spelling is a deprecated alias.
    self.assertEqual(1, len(crawled_dict))
    self.assertEqual(2, len(products))

    self.assertEqual('Produto 1', products[0].name)
    self.assertEqual('Titulo do Produto 1', products[0].title)
    self.assertTrue(
        os.path.join(base_path, 'produto_1.html') in products[0].url)

    self.assertEqual('Produto 2', products[1].name)
    self.assertEqual('Titulo do Produto 2', products[1].title)
    self.assertTrue(
        os.path.join(base_path, 'produto_2.html') in products[1].url)

    exporter = CSVExporter()
    filename = ''
    # items() iterates on both Python 2 and 3 (iteritems() is py2-only).
    for crawlable_name, exportable_list in crawled_dict.items():
        exporter.export(crawlable_name, exportable_list)
        filename = exporter.__get_filename__(crawlable_name)
        # Exported CSV must exist and contain data.
        self.assertTrue(os.path.getsize(filename) > 0)
        os.remove(filename)
# Tail of the logger configuration: a file handler that records crawl
# results, a stderr handler for warnings and above, and an optional
# stdout echo of page/result events.
# NOTE(review): `opts` and `just_message_fmt` are defined earlier in the
# (not shown) configuration code — confirm against the full file.
result_handler = logging.FileHandler(os.path.join(opts.log_dir, 'pycrawl.results.log'), 'w')
result_handler.setFormatter(just_message_fmt)
result_handler.setLevel(logging.INFO)
logging.getLogger('result').addHandler(result_handler)

err_handler = logging.StreamHandler(sys.stderr)
# logging.WARNING: WARN is a deprecated alias with the same value.
err_handler.setLevel(logging.WARNING)
err_handler.setFormatter(logging.Formatter('%(name)-10s %(levelname)-8s %(message)s'))
# Attach to the root logger so every logger's warnings reach stderr.
logging.getLogger('').addHandler(err_handler)

if opts.print_pages or opts.print_results:
    out_handler = logging.StreamHandler(sys.stdout)
    out_handler.setLevel(logging.INFO)
    out_handler.setFormatter(just_message_fmt)
    if opts.print_pages:
        logging.getLogger('page').addHandler(out_handler)
    if opts.print_results:
        logging.getLogger('result').addHandler(out_handler)


if __name__ == '__main__':
    # Sample crawl application. Creates a crawler which will crawl local
    # URLs and print URLs with links containing 'California'. Specify a
    # custom user agent (Chrome 24 on Win7) and throttle requests to 10
    # per minute. Then crawl the English Wikipedia, hitting the homepage
    # and all pages linked to from that page, but no deeper.
    # (Was a no-op bare string statement; an `if` block takes no docstring.)
    configure(sys.argv)
    crawl = Crawler(test.basic.isLocal, action.basic.tagContains('a', 'California'))
    crawl.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17")
    crawl.setMaxHitsPerMin(10)
    crawl.crawl('http://en.wikipedia.org/wiki/Main_Page', 1)
# coding: utf-8
"""Entry-point script: crawl EpocaCosmeticos and export the results to CSV."""
from crawling.crawler import Crawler
from service.impl.epoca_cosmeticos import EpocaCosmeticos
from service.impl.csv_exporter import CSVExporter

crawler = Crawler()
# NOTE(review): max_delay presumably bounds the per-request delay in
# seconds — confirm against Crawler.crawl.
crawled_dict = crawler.crawl([EpocaCosmeticos()], max_delay=5)

exporter = CSVExporter()
# items() works on both Python 2 and 3; iteritems() is Python 2 only.
for crawlable_name, exportable_list in crawled_dict.items():
    exporter.export(crawlable_name, exportable_list)