def test_crawl_with_csv(self):
        """Crawl the local EpocaCosmeticos fixture pages and verify that the
        parsed products are correct and that the CSV export produces a
        non-empty file.
        """
        # Patch urljoin so relative links in the fixtures resolve locally.
        urlparse.urljoin = self.urljoin

        crawler = Crawler()
        epoca_cosmeticos = EpocaCosmeticos()
        # Stub the page fetchers so the test reads local HTML fixtures
        # instead of hitting the network.
        epoca_cosmeticos.get_home_page = self.get_home_page
        epoca_cosmeticos.get_product_pages = self.get_product_pages
        crawled_dict = crawler.crawl([epoca_cosmeticos])

        base_path = os.path.abspath('.') + os.sep + 'tests'
        products = crawled_dict['EpocaCosmeticos']
        # assertEqual / assertIn instead of the deprecated assertEquals
        # and the weaker assertTrue(x in y) (better failure messages).
        self.assertEqual(1, len(crawled_dict))
        self.assertEqual(2, len(products))
        self.assertEqual('Produto 1', products[0].name)
        self.assertEqual('Titulo do Produto 1', products[0].title)
        self.assertIn(os.path.join(base_path, 'produto_1.html'), products[0].url)
        self.assertEqual('Produto 2', products[1].name)
        self.assertEqual('Titulo do Produto 2', products[1].title)
        self.assertIn(os.path.join(base_path, 'produto_2.html'), products[1].url)

        exporter = CSVExporter()
        filename = ''
        for crawlable_name, exportable_list in crawled_dict.iteritems():
            exporter.export(crawlable_name, exportable_list)
            filename = exporter.__get_filename__(crawlable_name)

        # Remove the exported CSV even when the size assertion fails,
        # so a failing run does not leave artifacts behind.
        try:
            self.assertTrue(os.path.getsize(filename) > 0)
        finally:
            os.remove(filename)
# --- Example #2 ---
    
    # NOTE(review): this is the tail of a configure() function — `opts` and
    # `just_message_fmt` are defined above this excerpt; verify in context.
    # Crawl results are appended to the 'result' logger and also written to
    # a per-run file in opts.log_dir ('w' mode overwrites the previous run).
    result_handler = logging.FileHandler(os.path.join(opts.log_dir, 'pycrawl.results.log'), 'w')
    result_handler.setFormatter(just_message_fmt)
    result_handler.setLevel(logging.INFO)
    logging.getLogger('result').addHandler(result_handler)
    
    # Warnings and errors from every logger go to stderr, tagged with the
    # logger name and level for readability.
    err_handler = logging.StreamHandler(sys.stderr)
    err_handler.setLevel(logging.WARN)
    err_handler.setFormatter(logging.Formatter('%(name)-10s %(levelname)-8s  %(message)s'))
    logging.getLogger('').addHandler(err_handler)
    
    # Optionally mirror 'page' and/or 'result' records to stdout when the
    # corresponding command-line flags were given.
    if opts.print_pages or opts.print_results:
      out_handler = logging.StreamHandler(sys.stdout)
      out_handler.setLevel(logging.INFO)
      out_handler.setFormatter(just_message_fmt)
      if opts.print_pages:
        logging.getLogger('page').addHandler(out_handler)
      if opts.print_results:
        logging.getLogger('result').addHandler(out_handler)

if __name__ == '__main__':
    # Sample crawl application: crawl local URLs and print URLs whose link
    # tags contain 'California'.  A custom user agent (Chrome 24 on Win7)
    # is set and requests are throttled to 10 per minute; the crawl covers
    # the English Wikipedia homepage and every page it links to, no deeper.
    configure(sys.argv)

    crawler = Crawler(test.basic.isLocal, action.basic.tagContains('a', 'California'))
    crawler.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17")
    crawler.setMaxHitsPerMin(10)
    crawler.crawl('http://en.wikipedia.org/wiki/Main_Page', 1)
# --- Example #3 ---
# coding: utf-8

from crawling.crawler import Crawler
from service.impl.epoca_cosmeticos import EpocaCosmeticos
from service.impl.csv_exporter import CSVExporter

# Crawl every registered crawlable and export the results to CSV.
crawler = Crawler()
# max_delay=5 is forwarded to Crawler.crawl — presumably a request-delay
# cap; confirm against the Crawler API.
crawled_dict = crawler.crawl([EpocaCosmeticos()], max_delay=5)

exporter = CSVExporter()

# One CSV per crawlable name, holding everything that was collected for it.
for name in crawled_dict:
    exporter.export(name, crawled_dict[name])