Example No. 1
def close_spider(self, spider):
    # Do compaction each time to save space and also recreate files to
    # avoid them being removed in storage with timestamp-based autoremoval.
    if self.db is not None:
        self.db.compact_range()
        del self.db
    garbage_collect()
Example No. 2
def close_spider(self, spider):
    # Do compaction each time to save space and also recreate files to
    # avoid them being removed in storages with timestamp-based autoremoval.
    self.db.CompactRange()
    del self.db
    garbage_collect()
Example No. 3
def scrapy():
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect()
        # on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()
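
Every example here ends by calling a small garbage_collect() helper. As a point of reference only, a minimal sketch of such a helper, assuming it does nothing more than force a collection (because PyPy does not run gc.collect() at interpreter exit), could look like this:

import gc

def garbage_collect():
    # Force a full collection; PyPy does not run gc.collect() on exit,
    # so errors printed from DebugInfo.__del__ would otherwise be lost.
    gc.collect()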
Example No. 4
def close_spider(self, spider):
    # Do compaction each time to save space and also recreate files to
    # avoid them being removed in storages with timestamp-based autoremoval.
    if self.dbdriver == 'plyvel':
        self.db.compact_range()
    elif self.dbdriver == 'leveldb':
        self.db.CompactRange()
    del self.db
    garbage_collect()
    super(LeveldbCacheStorage, self).close_spider(spider)
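
For context, a matching open_spider() for the driver-switching variant above might look roughly like this. This is only a sketch: self.cachedir and self.dbdriver are assumed attributes following the example above, while plyvel.DB and leveldb.LevelDB are the actual constructors exposed by the two drivers:

import os

def open_spider(self, spider):
    dbpath = os.path.join(self.cachedir, f'{spider.name}.leveldb')
    if self.dbdriver == 'plyvel':
        # plyvel exposes a snake_case API (DB, compact_range, ...)
        import plyvel
        self.db = plyvel.DB(dbpath, create_if_missing=True)
    elif self.dbdriver == 'leveldb':
        # py-leveldb exposes a CamelCase API (LevelDB, CompactRange, ...)
        import leveldb
        self.db = leveldb.LevelDB(dbpath)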
Example No. 5
def execute_crawler(identifier):
    # Runs a crawler from the command line
    configure_logging()
    settings = project.get_project_settings()

    try:
        # `identifier` names the spider to run; `crawl` is expected to
        # schedule it on the Twisted reactor before reactor.run() starts.
        crawl(identifier, settings)
        reactor.run()
    finally:
        garbage_collect()
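
An alternative to the bare crawl()/reactor.run() pair in the example above is Scrapy's documented CrawlerProcess, which manages the reactor itself; a rough equivalent would be:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider(name):
    # CrawlerProcess starts and stops the Twisted reactor on its own;
    # start() blocks until the crawl finishes.
    process = CrawlerProcess(get_project_settings())
    process.crawl(name)
    process.start()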
Example No. 6
    def test_cache_with_limit(self):
        cache = LocalWeakReferencedCache(limit=2)
        r1 = Request('https://example.org')
        r2 = Request('https://example.com')
        r3 = Request('https://example.net')
        cache[r1] = 1
        cache[r2] = 2
        cache[r3] = 3
        self.assertEqual(len(cache), 2)
        self.assertNotIn(r1, cache)
        self.assertIn(r2, cache)
        self.assertIn(r3, cache)
        self.assertEqual(cache[r2], 2)
        self.assertEqual(cache[r3], 3)
        del r2

        # PyPy takes longer to collect dead references
        garbage_collect()

        self.assertEqual(len(cache), 1)

    def test_cache_without_limit(self):
        max = 10**4
        cache = LocalWeakReferencedCache()
        refs = []
        for x in range(max):
            refs.append(Request(f'https://example.org/{x}'))
            cache[refs[-1]] = x
        self.assertEqual(len(cache), max)
        for i, r in enumerate(refs):
            self.assertIn(r, cache)
            self.assertEqual(cache[r], i)
        del r  # delete reference to the last object in the list

        # delete half of the objects, make sure that is reflected in the cache
        for _ in range(max // 2):
            refs.pop()

        # PyPy takes longer to collect dead references
        garbage_collect()

        self.assertEqual(len(cache), max // 2)
        for i, r in enumerate(refs):
            self.assertIn(r, cache)
            self.assertEqual(cache[r], i)
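
The two tests above exercise a cache keyed by weak references, with an optional size limit and automatic eviction of entries whose keys have been garbage collected. As an illustration only (not Scrapy's actual LocalWeakReferencedCache implementation), such a structure can be built on the standard weakref module roughly like this:

import weakref
from collections import OrderedDict

class WeakLimitedCache:
    # Illustrative sketch: values are dropped when their key is garbage
    # collected, and the oldest entry is evicted once `limit` is reached.

    def __init__(self, limit=None):
        self.limit = limit
        self.data = OrderedDict()  # weakref.ref(key) -> value

    def __setitem__(self, key, value):
        if self.limit and len(self.data) >= self.limit:
            self.data.popitem(last=False)  # evict the oldest entry
        self.data[weakref.ref(key, self._remove)] = value

    def _remove(self, ref):
        # Callback fired once the referent has been garbage collected.
        self.data.pop(ref, None)

    def __getitem__(self, key):
        return self.data[weakref.ref(key)]

    def __contains__(self, key):
        return weakref.ref(key) in self.data

    def __len__(self):
        return len(self.data)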
Example No. 8

def _run_command(cmd, args, opts):
    if opts.profile:
        _run_command_profiled(cmd, args, opts)
    else:
        cmd.run(args, opts)


# Run cmd.run(args, opts) under cProfile, passing in the local variables
def _run_command_profiled(cmd, args, opts):
    if opts.profile:
        sys.stderr.write(
            f"scrapy: writing cProfile stats to {opts.profile!r}\n")
    loc = locals()  # capture the local variables (cmd, args, opts)
    p = cProfile.Profile()
    p.runctx('cmd.run(args, opts)', globals(),
             loc)  # execute the statement with access to globals() and the captured locals
    if opts.profile:
        p.dump_stats(opts.profile)


if __name__ == '__main__':
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() on exit:
        # http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()
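
The stats file written by p.dump_stats(opts.profile) can later be inspected with the standard pstats module, for example (the file name is whatever was passed via --profile; the path below is illustrative):

import pstats

stats = pstats.Stats('crawl.cprof')
stats.sort_stats('cumulative').print_stats(10)  # 10 hottest calls by cumulative time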
Example No. 9
def main():
    """Command line entry point."""

    settings = get_project_settings()
    configure_logging(settings)

    args = _parse_args()
    LOGGER.info(args)

    base_dir = Path(settings["BASE_DIR"]).resolve()
    cache_dir = base_dir / ".scrapy" / "httpcache"
    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else base_dir / "feeds"
    feeds_dir = feeds_dir.resolve()
    feeds_dir_scraper = (
        feeds_dir / args.feeds_subdir if args.feeds_subdir else feeds_dir / args.spider
    )
    file_tag = normalize_space(args.file_tag)
    out_file = feeds_dir_scraper / "%(class)s" / f"%(time)s{file_tag}.jl"

    LOGGER.info("Output file will be <%s>", out_file)

    from_settings = job_dir_from_settings(settings)
    job_dir = (
        Path(args.job_dir)
        if args.job_dir
        else Path(from_settings)
        if from_settings
        else base_dir / "jobs" / args.spider
    )
    job_dir = job_dir.resolve()

    cache_dir.mkdir(parents=True, exist_ok=True)
    feeds_dir_scraper.mkdir(parents=True, exist_ok=True)
    job_dir.mkdir(parents=True, exist_ok=True)

    dont_run_before_file = job_dir / ".dont_run_before"
    dont_run_before = parse_date(
        args.dont_run_before, tzinfo=timezone.utc
    ) or date_from_file(dont_run_before_file, tzinfo=timezone.utc)

    if dont_run_before:
        LOGGER.info("Don't run before %s", dont_run_before.isoformat())
        sleep_seconds = dont_run_before.timestamp() - now().timestamp()
        if sleep_seconds > 0:
            LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds)
            sleep(sleep_seconds)

    states = _find_states(
        job_dir, state_file=settings.get("STATE_TAG_FILE") or ".state"
    )

    running = sorted(sub_dir for sub_dir, state in states.items() if state == "running")

    if len(running) > 1:
        LOGGER.warning(
            "Found %d running jobs %s, please check and fix!", len(running), running
        )
        return

    if running:
        LOGGER.info("Found a running job <%s>, skipping...", running[0])
        return

    resumable = sorted(
        sub_dir for sub_dir, state in states.items() if state in RESUMABLE_STATES
    )

    if len(resumable) > 1:
        LOGGER.warning(
            "Found %d resumable jobs %s, please check and fix!",
            len(resumable),
            resumable,
        )
        return

    if resumable:
        LOGGER.info("Resuming previous job <%s>", resumable[0])

    job_tag = resumable[0] if resumable else now().strftime(DATE_FORMAT)
    curr_job = job_dir / job_tag

    command = [
        "scrapy",
        "crawl",
        args.spider,
        "--output",
        str(out_file),
        "--set",
        f"JOBDIR={curr_job}",
        "--set",
        f"DONT_RUN_BEFORE_FILE={dont_run_before_file}",
    ]

    try:
        execute(argv=command)
    finally:
        garbage_collect()
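
For reference, the argv list assembled above is equivalent to a shell invocation of roughly this shape (angle-bracket placeholders stand for the paths and names computed earlier):

scrapy crawl <spider> --output "<feeds_dir_scraper>/%(class)s/%(time)s<file_tag>.jl" --set "JOBDIR=<job_dir>/<job_tag>" --set "DONT_RUN_BEFORE_FILE=<job_dir>/.dont_run_before"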
Example No. 10
def close_spider(self, spider):
    # Do compaction each time to save space and also recreate files to
    # avoid them being removed in storages with timestamp-based autoremoval.
    self.db.CompactRange()
    del self.db
    garbage_collect()
Example No. 11
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)

def _run_command(cmd, args, opts):
    if opts.profile:
        _run_command_profiled(cmd, args, opts)
    else:
        cmd.run(args, opts)

def _run_command_profiled(cmd, args, opts):
    if opts.profile:
        sys.stderr.write("scrapy: writing cProfile stats to %r\n" % opts.profile)
    loc = locals()
    p = cProfile.Profile()
    p.runctx('cmd.run(args, opts)', globals(), loc)
    if opts.profile:
        p.dump_stats(opts.profile)

if __name__ == '__main__':
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect()
        # on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()