def crawl_command(args):
    """Crawl every subscribed feed and store the results in the stage.

    Reads ``args.repository``, ``args.session_id``, ``args.threads`` and
    ``args.verbose``.  The subscription list is loaded from the stage; if
    it is missing (or falsy) a message is written to standard error and
    the function returns without crawling.

    Each crawled feed is stored in ``stage.feeds`` under the SHA-1 hex
    digest of its feed URL.  When ``args.threads`` is ``None`` the result
    of ``cpu_count()`` is used as the thread count.

    A ``CrawlError`` raised while iterating the crawl results ends the
    loop; the error is printed to standard error.

    :param args: parsed command-line arguments
    :returns: ``None``

    """
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    urllist = [subscription.feed_uri
               for subscription in opml.recursive_subscriptions]
    threads_count = args.threads if args.threads is not None else cpu_count()
    generator = crawl(urllist, threads_count)
    try:
        for feed_url, feed_data, crawler_hints in generator:
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            # hashlib accepts only bytes; on Python 3 passing a text URL
            # raises TypeError.  Encode text explicitly and leave byte
            # strings (the common case on Python 2) untouched.
            if isinstance(feed_url, bytes):
                url_bytes = feed_url
            else:
                url_bytes = feed_url.encode('utf-8')
            feed_id = hashlib.sha1(url_bytes).hexdigest()
            with stage:
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)
def crawl_command(args):
    """Crawl subscribed feeds and save each crawled feed into the stage.

    If ``args.feed_id`` is truthy, only subscriptions whose ``feed_id``
    equals it are crawled; otherwise every entry of the subscription
    list's ``recursive_subscriptions`` is crawled.  The function prints a
    message to standard error and returns early when the subscription
    list is missing or when there is nothing to crawl.

    ``CrawlError`` and ``SchemaError`` raised while handling one result
    are reported on standard error and the loop moves on to the next
    result; the loop ends when the result iterator is exhausted.

    :param args: parsed command-line arguments (``repository``,
                 ``session_id``, ``feed_id``, ``threads``, ``verbose``)
    :returns: ``None``

    """
    repository = from_url(args.repository)
    stage = Stage(Session(args.session_id), repository)
    with stage:
        subscription_list = stage.subscriptions
    if not subscription_list:
        print('OPML does not exist in the repository', file=sys.stderr)
        return

    # Map each feed URI to its feed id so results can be stored by id.
    requested_id = args.feed_id
    if requested_id:
        matching_pairs = (
            (sub.feed_uri, sub.feed_id)
            for sub in subscription_list.recursive_subscriptions
            if sub.feed_id == requested_id
        )
        feed_map = dict(matching_pairs)
        if not feed_map:
            print('There is no such feed:', requested_id, file=sys.stderr)
            return
    else:
        all_pairs = (
            (sub.feed_uri, sub.feed_id)
            for sub in subscription_list.recursive_subscriptions
        )
        feed_map = dict(all_pairs)
        if not feed_map:
            print('No feeds to crawl', file=sys.stderr)
            return

    if args.threads is not None:
        worker_count = args.threads
    else:
        worker_count = cpu_count()

    results = iter(crawl(feed_map.keys(), worker_count))
    # Pull results one by one so that a failure on a single feed is
    # reported without abandoning the remaining results.
    while True:
        try:
            feed_url, feed_data, crawler_hints = next(results)
            if args.verbose:
                summary = '{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                )
                print(summary)
            with stage:
                stored_id = feed_map[feed_url]
                stage.feeds[stored_id] = feed_data
        except CrawlError as error:
            print('Something went wrong with', error.feed_uri,
                  file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
        except SchemaError as error:
            print(error, file=sys.stderr)
        except StopIteration:
            break
def crawl_command(args):
    """Run the crawler over the repository's subscriptions.

    The subscription list is read from the stage.  With a truthy
    ``args.feed_id`` the crawl is limited to subscriptions having that
    feed id; without it, all of ``recursive_subscriptions`` is used.
    Every crawled feed is written to ``stage.feeds`` under the feed id
    recorded for its URI.

    Problems are reported on standard error: a missing subscription
    list, an empty selection of feeds, and any ``CrawlError`` or
    ``SchemaError`` raised while processing a single result (processing
    then continues with the next result).

    :param args: parsed command-line arguments (``repository``,
                 ``session_id``, ``feed_id``, ``threads``, ``verbose``)
    :returns: ``None``

    """
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return

    # Build the URI -> feed id table and remember which complaint to
    # print if the table turns out to be empty.
    feed_id = args.feed_id
    if feed_id:
        feed_map = dict(
            (sub.feed_uri, sub.feed_id)
            for sub in opml.recursive_subscriptions
            if sub.feed_id == feed_id
        )
        complaint = ('There is no such feed:', feed_id)
    else:
        feed_map = dict(
            (sub.feed_uri, sub.feed_id)
            for sub in opml.recursive_subscriptions
        )
        complaint = ('No feeds to crawl',)
    if not feed_map:
        print(*complaint, file=sys.stderr)
        return

    threads_count = args.threads
    if threads_count is None:
        threads_count = cpu_count()

    iterator = iter(crawl(feed_map.keys(), threads_count))
    while True:
        try:
            feed_url, feed_data, crawler_hints = next(iterator)
            if args.verbose:
                entry_count = len(feed_data.entries)
                print('{0.title} - {1} entries'.format(feed_data,
                                                       entry_count))
            with stage:
                feed_id = feed_map[feed_url]
                stage.feeds[feed_id] = feed_data
        except (CrawlError, SchemaError) as exc:
            if not isinstance(exc, CrawlError):
                # SchemaError: the exception text itself is the report.
                print(exc, file=sys.stderr)
                continue
            print('Something went wrong with', exc.feed_uri,
                  file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
        except StopIteration:
            # The crawler has no more results.
            break
def test_cpu_count():
    """``cpu_count()`` must return a positive integral number."""
    is_integral = isinstance(cpu_count(), numbers.Integral)
    assert is_integral
    is_positive = cpu_count() > 0
    assert is_positive