def test_crawl_with_csv(self):
    # Patch urljoin so links in the local fixture pages resolve correctly.
    urlparse.urljoin = self.urljoin

    crawler = Crawler()
    epocaCosmeticos = EpocaCosmeticos()
    epocaCosmeticos.get_home_page = self.get_home_page
    epocaCosmeticos.get_product_pages = self.get_product_pages
    crawled_dict = crawler.crawl([epocaCosmeticos])

    base_path = os.path.join(os.path.abspath('.'), 'tests')
    file_base_path = 'file:///' + base_path
    products = crawled_dict['EpocaCosmeticos']

    self.assertEqual(1, len(crawled_dict))
    self.assertEqual(2, len(products))
    self.assertEqual('Produto 1', products[0].name)
    self.assertEqual('Titulo do Produto 1', products[0].title)
    self.assertIn(os.path.join(base_path, 'produto_1.html'), products[0].url)
    self.assertEqual('Produto 2', products[1].name)
    self.assertEqual('Titulo do Produto 2', products[1].title)
    self.assertIn(os.path.join(base_path, 'produto_2.html'), products[1].url)

    # Export the crawled products and check that a non-empty CSV was written.
    exporter = CSVExporter()
    filename = ''
    for crawlable_name, exportable_list in crawled_dict.iteritems():
        exporter.export(crawlable_name, exportable_list)
        filename = exporter.__get_filename__(crawlable_name)
    self.assertTrue(os.path.getsize(filename) > 0)
    os.remove(filename)
def get_all_topics():
    crawler = Crawler()
    home_topics = crawler.get_home_topics() or []
    logger.info(f'{len(home_topics)} home topics were found.')

    all_topics = []
    for ht in home_topics:
        topics = crawler.get_topics_by_home_topic(ht) or []
        logger.info(f'{len(topics)} topics were found under home topic {ht}.')
        all_topics.extend(topics)
        break  # for debug
    return all_topics
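
# Other snippets in this collection call get_all_topics(crawler, thread_num=4),
# but that threaded variant is not shown anywhere here. The sketch below is a
# guess at what it might look like, assuming concurrent.futures and the same
# Crawler methods used above; it is not the project's actual implementation.
from concurrent.futures import ThreadPoolExecutor


def get_all_topics(crawler, thread_num=4):
    # Hypothetical threaded variant: fan out one fetch per home topic.
    home_topics = crawler.get_home_topics() or []
    logger.info(f'{len(home_topics)} home topics were found.')

    def fetch(ht):
        topics = crawler.get_topics_by_home_topic(ht) or []
        logger.info(f'{len(topics)} topics were found under home topic {ht}.')
        return topics

    all_topics = []
    with ThreadPoolExecutor(max_workers=thread_num) as pool:
        for topics in pool.map(fetch, home_topics):
            all_topics.extend(topics)
    return all_topics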
def test_get_product(self):
    crawler = Crawler()
    base_path = os.path.abspath('.') + os.sep + 'tests'
    file_base_path = 'file:///' + base_path
    link = os.path.join(file_base_path, 'produto_1.html')

    epoca = EpocaCosmeticos()
    print epoca.get_product_pages()  # debug output: configured product pages

    product = Page(EpocaCosmeticos(), link).get_product()
    self.assertEqual('Produto 1', product.name)
    self.assertEqual('Titulo do Produto 1', product.title)
    self.assertEqual(link, product.url)
def main():
    redis = redis_init()
    mongo = Mongo()
    sync_redis_with_mongo(redis, mongo)

    crawler = Crawler()
    topics = get_all_topics(crawler, thread_num=4)
    batch_process_topics_data(redis, mongo, crawler, topics, batch=50, thread_num=8)
from __future__ import absolute_import

import logging

from utils.mongo import Mongo
from utils.toolkit import logging_init, redis_init
from crawling.crawler import Crawler

logging_init()
logger = logging.getLogger(__name__)


def get_all_topics():
    crawler = Crawler()
    home_topics = crawler.get_home_topics() or []
    logger.info(f'{len(home_topics)} home topics were found.')

    all_topics = []
    for ht in home_topics:
        topics = crawler.get_topics_by_home_topic(ht) or []
        logger.info(f'{len(topics)} topics were found under home topic {ht}.')
        all_topics.extend(topics)
        break  # for debug
    return all_topics


if __name__ == '__main__':
    redis = redis_init()
    topics = get_all_topics()
    crawler = Crawler()
    # Lazily maps each topic to its crawled data; nothing is fetched until
    # the iterator is consumed.
    topic_iter = map(crawler.get_topic_data, topics)
result_handler = logging.FileHandler(os.path.join(opts.log_dir, 'pycrawl.results.log'), 'w')
result_handler.setFormatter(just_message_fmt)
result_handler.setLevel(logging.INFO)
logging.getLogger('result').addHandler(result_handler)

err_handler = logging.StreamHandler(sys.stderr)
err_handler.setLevel(logging.WARN)
err_handler.setFormatter(logging.Formatter('%(name)-10s %(levelname)-8s %(message)s'))
logging.getLogger('').addHandler(err_handler)

if opts.print_pages or opts.print_results:
    out_handler = logging.StreamHandler(sys.stdout)
    out_handler.setLevel(logging.INFO)
    out_handler.setFormatter(just_message_fmt)
    if opts.print_pages:
        logging.getLogger('page').addHandler(out_handler)
    if opts.print_results:
        logging.getLogger('result').addHandler(out_handler)


if __name__ == '__main__':
    # Sample crawl application. Creates a crawler which will crawl local URLs
    # and print URLs with links containing 'California'. Specify a custom user
    # agent (Chrome 24 on Win7) and throttle requests to 10 per minute. Then
    # crawl the English Wikipedia, hitting the homepage and all pages linked
    # to from that page, but no deeper.
    configure(sys.argv)

    crawl = Crawler(test.basic.isLocal, action.basic.tagContains('a', 'California'))
    crawl.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 "
                       "(KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17")
    crawl.setMaxHitsPerMin(10)
    crawl.crawl('http://en.wikipedia.org/wiki/Main_Page', 1)
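
# The handler setup above runs inside a configure(argv) function whose option
# parsing is not shown. The sketch below is a rough guess at how opts and
# just_message_fmt might be produced, assuming argparse; the flag names simply
# mirror the attributes used above and are not taken from the real script.
import argparse
import logging


def configure(argv):
    parser = argparse.ArgumentParser(description='pycrawl sample crawl application')
    parser.add_argument('--log-dir', dest='log_dir', default='.')
    parser.add_argument('--print-pages', dest='print_pages', action='store_true')
    parser.add_argument('--print-results', dest='print_results', action='store_true')
    opts = parser.parse_args(argv[1:])

    # Formatter that emits only the message text; the result, page, and stdout
    # handlers above all reuse it.
    just_message_fmt = logging.Formatter('%(message)s')

    # ... the handler wiring from the fragment above would follow here ...
    return opts, just_message_fmt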
def __init__(self, queue_name='crawling'):
    self.amqp = Amqp(queue_name, ['database'])
    self.crawler = Crawler(self.amqp)
    self.amqp.receive(self)
def __init__(self):
    Crawler.__init__(self)
    self.db = DatabaseConnector()
def crawl():
    sync_redis_with_mongo()
    crawler = Crawler()
    all_topic_dicts = get_all_topics(crawler, thread_num=4)
    batch_process_topics_data(crawler, all_topic_dicts, batch=50, thread_num=8)
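
# batch_process_topics_data is not defined in any of these snippets. The sketch
# below shows one plausible shape, assuming it chunks the topic list into groups
# of `batch`, fetches each topic with crawler.get_topic_data (seen above) on a
# thread pool, and hands each finished batch to a hypothetical save_topics_data
# helper; none of this is confirmed by the source.
from concurrent.futures import ThreadPoolExecutor


def batch_process_topics_data(crawler, topics, batch=50, thread_num=8):
    # Hypothetical implementation: process topics in fixed-size batches so
    # results can be persisted incrementally instead of all at once.
    with ThreadPoolExecutor(max_workers=thread_num) as pool:
        for start in range(0, len(topics), batch):
            chunk = topics[start:start + batch]
            results = list(pool.map(crawler.get_topic_data, chunk))
            save_topics_data(results)  # hypothetical persistence helper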
# coding: utf-8
from crawling.crawler import Crawler
from service.impl.epoca_cosmeticos import EpocaCosmeticos
from service.impl.csv_exporter import CSVExporter

crawler = Crawler()
crawled_dict = crawler.crawl([EpocaCosmeticos()], max_delay=5)

exporter = CSVExporter()
for crawlable_name, exportable_list in crawled_dict.iteritems():
    exporter.export(crawlable_name, exportable_list)