def __init__(self, spec=None, spider=None, spiders=None, start_jobs=None,
             queue_path=None, threads=25, buffer_size=DEFAULT_IMAP_BUFFER_SIZE,
             throttle=DEFAULT_THROTTLE, join=True, daemonic=False):

    # NOTE: crawling could work depth-first but:
    # buffer_size should be 0 (requires to fix quenouille issue #1)

    # Params
    self.start_jobs = start_jobs
    self.queue_path = queue_path
    self.threads = threads
    self.buffer_size = buffer_size
    self.throttle = throttle
    self.join = join
    self.daemonic = daemonic

    self.using_persistent_queue = queue_path is not None
    self.pool = create_pool(threads=threads)
    self.state = CrawlerState()
    self.started = False

    # Memory queue
    if not self.using_persistent_queue:
        queue = Queue()

    # Persistent queue
    else:
        queue = SQLiteQueue(queue_path, multithreading=True, auto_commit=False)

    # Creating spiders
    if spec is not None:
        if not isinstance(spec, dict):
            spec = load_definition(spec)

        if 'spiders' in spec:
            spiders = {
                name: DefinitionSpider(s, name=name)
                for name, s in spec['spiders'].items()
            }
            self.single_spider = False
        else:
            spiders = {'default': DefinitionSpider(spec)}
            self.single_spider = True

    elif spider is not None:
        spiders = {'default': spider}

    elif spiders is None:
        raise TypeError('minet.Crawler: expecting either `spec`, `spider` or `spiders`.')

    # Solving function spiders
    for name, s in spiders.items():
        if callable(s) and not isinstance(s, Spider):
            spiders[name] = FunctionSpider(s, name)

    self.queue = queue
    self.spiders = spiders
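# A minimal usage sketch for the constructor above, assuming a YAML spider
# definition living at ./crawler.yml and that Crawler is importable from
# minet.crawl (both are assumptions made for illustration, not confirmed
# paths). The iteration pattern mirrors crawl_action below.
from minet.crawl import Crawler

crawler = Crawler('./crawler.yml', threads=10, throttle=1.0)
crawler.start()

for result in crawler:
    if result.error is not None:
        continue
    print(result.job.spider, result.scraped)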
def crawl_action(namespace):

    # Loading crawler definition
    queue_path = join(namespace.output_dir, 'queue')
    definition = load_definition(namespace.crawler)

    if namespace.resume:
        print_err('Resuming crawl...')
    else:
        rmtree(queue_path, ignore_errors=True)

    # Scaffolding output directory
    os.makedirs(namespace.output_dir, exist_ok=True)

    jobs_output_path = join(namespace.output_dir, 'jobs.csv')
    jobs_output, jobs_writer = open_report(jobs_output_path, JOBS_HEADERS, resume=namespace.resume)

    # Creating crawler
    crawler = Crawler(definition, throttle=namespace.throttle, queue_path=queue_path)

    reporter_pool = ScraperReporterPool(crawler, namespace.output_dir, resume=namespace.resume)

    # Loading bar
    loading_bar = tqdm(desc='Crawling', unit=' pages', dynamic_ncols=True)

    def update_loading_bar(result):
        state = crawler.state
        loading_bar.set_postfix(queue=state.jobs_queued, spider=result.job.spider)
        loading_bar.update()

    # Starting crawler
    crawler.start()

    # Running crawler
    for result in crawler:
        update_loading_bar(result)
        jobs_writer.writerow(format_job_for_csv(result))

        if result.error is not None:
            continue

        reporter_pool.write(result.job.spider, result.scraped)

    loading_bar.close()
    jobs_output.close()
    reporter_pool.close()
def scrape_action(namespace):

    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except:
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(desc='Scraping pages', total=namespace.total, dynamic_ncols=True, unit=' pages')
    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if not isinstance(items, list):
                items = [items]

            for item in items:
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    output_file.close()
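# The pool loop above consumes (error, items) tuples produced by `worker`.
# What follows is only a hypothetical sketch of that contract, not minet's
# actual worker: the payload layout and the callable `scraper` passed along
# with each file are assumptions made for illustration.
def worker(payload):
    path, encoding, scraper = payload  # assumed payload layout

    try:
        with open(path, encoding=encoding) as f:
            html = f.read()
    except OSError as e:
        # An empty list keeps the consuming loop from writing any row
        return e, []

    return None, scraper(html)  # assumes the scraper is callable on raw HTML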
def __init__(self, definition, strain=None):
    if not isinstance(definition, dict):
        definition = load_definition(definition)

    # Validating the definition
    errors = validate(definition)

    if errors:
        raise InvalidScraperError('scraper is invalid', validation_errors=errors)

    self.definition = definition

    # Analysis of the definition
    analysis = analyse(definition)

    self.headers = analysis.headers
    self.plural = analysis.plural
    self.output_type = analysis.output_type

    # Strainer
    self.strainer = None

    if strain is not None:
        self.strainer = strainer_from_css(strain)
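# A minimal sketch of building the scraper above from an inline dict
# definition. The import path and the definition keys are assumptions made
# for illustration; an invalid definition raises InvalidScraperError, as in
# the constructor above.
from minet.scrape import Scraper

definition = {
    'iterator': 'p',   # hypothetical definition, for illustration only
    'item': 'text'
}

scraper = Scraper(definition, strain='div.content p')
print(scraper.headers, scraper.output_type)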
def from_file(target):
    return Scraper(load_definition(target))
from minet.crawl import crawl
from minet.utils import load_definition

spider = load_definition('./ftest/crawlers/echojs_multiple.yml')

for result in crawl(spider, throttle=2):
    print(result.job)