Example #1
    def __init__(self, spec=None, spider=None, spiders=None, start_jobs=None,
                 queue_path=None, threads=25,
                 buffer_size=DEFAULT_IMAP_BUFFER_SIZE, throttle=DEFAULT_THROTTLE,
                 join=True, daemonic=False):

        # NOTE: crawling could work depth-first, but buffer_size would need
        # to be 0 (which requires fixing quenouille issue #1)

        # Params
        self.start_jobs = start_jobs
        self.queue_path = queue_path
        self.threads = threads
        self.buffer_size = buffer_size
        self.throttle = throttle
        self.join = join
        self.daemonic = daemonic

        self.using_persistent_queue = queue_path is not None
        self.pool = create_pool(threads=threads)
        self.state = CrawlerState()
        self.started = False

        # Memory queue
        if not self.using_persistent_queue:
            queue = Queue()

        # Persistent queue
        else:
            queue = SQLiteQueue(queue_path, multithreading=True, auto_commit=False)

        # Creating spiders
        if spec is not None:
            if not isinstance(spec, dict):
                spec = load_definition(spec)

            if 'spiders' in spec:
                spiders = {name: DefinitionSpider(s, name=name) for name, s in spec['spiders'].items()}
                self.single_spider = False
            else:
                spiders = {'default': DefinitionSpider(spec)}
                self.single_spider = True

        elif spider is not None:
            spiders = {'default': spider}

        elif spiders is None:
            raise TypeError('minet.Crawler: expecting either `spec`, `spider` or `spiders`.')

        # Solving function spiders
        for name, s in spiders.items():
            if callable(s) and not isinstance(s, Spider):
                spiders[name] = FunctionSpider(s, name)

        self.queue = queue
        self.spiders = spiders
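
A rough usage sketch of this constructor, modelled on Examples #2 and #6 below; the import location of Crawler and the definition file path are assumptions, not taken from these examples.

from minet.crawl import Crawler  # assumed import location

# `spec` may be a path to a YAML/JSON definition or an already-loaded dict.
crawler = Crawler('./crawlers/echojs.yml', threads=10, throttle=1)
crawler.start()

# The crawler is iterable and yields one result per crawled job,
# as in Example #2 below.
for result in crawler:
    print(result.job)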
Example #2
def crawl_action(namespace):

    # Loading crawler definition
    queue_path = join(namespace.output_dir, 'queue')
    definition = load_definition(namespace.crawler)

    if namespace.resume:
        print_err('Resuming crawl...')
    else:
        rmtree(queue_path, ignore_errors=True)

    # Scaffolding output directory
    os.makedirs(namespace.output_dir, exist_ok=True)

    jobs_output_path = join(namespace.output_dir, 'jobs.csv')
    jobs_output, jobs_writer = open_report(jobs_output_path,
                                           JOBS_HEADERS,
                                           resume=namespace.resume)

    # Creating crawler
    crawler = Crawler(definition,
                      throttle=namespace.throttle,
                      queue_path=queue_path)

    reporter_pool = ScraperReporterPool(crawler,
                                        namespace.output_dir,
                                        resume=namespace.resume)

    # Loading bar
    loading_bar = tqdm(desc='Crawling', unit=' pages', dynamic_ncols=True)

    def update_loading_bar(result):
        state = crawler.state

        loading_bar.set_postfix(queue=state.jobs_queued,
                                spider=result.job.spider)
        loading_bar.update()

    # Starting crawler
    crawler.start()

    # Running crawler
    for result in crawler:
        update_loading_bar(result)
        jobs_writer.writerow(format_job_for_csv(result))

        if result.error is not None:
            continue

        reporter_pool.write(result.job.spider, result.scraped)

    loading_bar.close()
    jobs_output.close()
    reporter_pool.close()
Example #3
def scrape_action(namespace):

    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except Exception:
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(desc='Scraping pages',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' pages')

    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if not isinstance(items, list):
                items = [items]

            for item in items:
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    output_file.close()
Example #4
    def __init__(self, definition, strain=None):
        if not isinstance(definition, dict):
            definition = load_definition(definition)

        # Validating the definition
        errors = validate(definition)

        if errors:
            raise InvalidScraperError('scraper is invalid', validation_errors=errors)

        self.definition = definition

        # Analysis of the definition
        analysis = analyse(definition)

        self.headers = analysis.headers
        self.plural = analysis.plural
        self.output_type = analysis.output_type

        # Strainer
        self.strainer = None

        if strain is not None:
            self.strainer = strainer_from_css(strain)
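
A short sketch of how this constructor might be called, assuming the Scraper class shown above is in scope; the definition file and the CSS selector passed as strain are illustrative assumptions.

# Illustrative sketch: the file path and CSS selector are assumptions,
# not taken from these examples.
scraper = Scraper('./scrapers/article.yml', strain='main a[href]')

# Attributes filled in by the analysis step above.
print(scraper.headers)      # e.g. usable as CSV fieldnames (cf. Example #3)
print(scraper.plural)
print(scraper.output_type)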
Example #5
    @staticmethod  # presumably static: `target` is the only parameter
    def from_file(target):
        return Scraper(load_definition(target))
Example #6
from minet.crawl import crawl
from minet.utils import load_definition

spider = load_definition('./ftest/crawlers/echojs_multiple.yml')

for result in crawl(spider, throttle=2):
    print(result.job)